No OneTemporary
Actions

Size

487 KB

Referenced Files

None

Subscribers

None

View Options

This file is larger than 256 KB, so syntax highlighting was skipped.

	Index: vendor-sys/illumos/dist/uts/common/fs/zfs/dmu_objset.c
	===================================================================
	--- vendor-sys/illumos/dist/uts/common/fs/zfs/dmu_objset.c (revision 284761)
	+++ vendor-sys/illumos/dist/uts/common/fs/zfs/dmu_objset.c (revision 284762)
	@@ -1,1845 +1,1988 @@
	/*
	* CDDL HEADER START
	*
	* The contents of this file are subject to the terms of the
	* Common Development and Distribution License (the "License").
	* You may not use this file except in compliance with the License.
	*
	* You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
	* or http://www.opensolaris.org/os/licensing.
	* See the License for the specific language governing permissions
	* and limitations under the License.
	*
	* When distributing Covered Code, include this CDDL HEADER in each
	* file and include the License file at usr/src/OPENSOLARIS.LICENSE.
	* If applicable, add the following below this CDDL HEADER, with the
	* fields enclosed by brackets "[]" replaced with your own identifying
	* information: Portions Copyright [yyyy] [name of copyright owner]
	*
	* CDDL HEADER END
	*/
	/*
	* Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
	* Copyright (c) 2012, 2014 by Delphix. All rights reserved.
	* Copyright (c) 2013 by Saso Kiselkov. All rights reserved.
	* Copyright (c) 2013, Joyent, Inc. All rights reserved.
	* Copyright (c) 2014 Spectra Logic Corporation, All rights reserved.
	* Copyright 2015 Nexenta Systems, Inc. All rights reserved.
	+ * Copyright (c) 2015, STRATO AG, Inc. All rights reserved.
	*/

	/* Portions Copyright 2010 Robert Milkowski */

	#include <sys/cred.h>
	#include <sys/zfs_context.h>
	#include <sys/dmu_objset.h>
	#include <sys/dsl_dir.h>
	#include <sys/dsl_dataset.h>
	#include <sys/dsl_prop.h>
	#include <sys/dsl_pool.h>
	#include <sys/dsl_synctask.h>
	#include <sys/dsl_deleg.h>
	#include <sys/dnode.h>
	#include <sys/dbuf.h>
	#include <sys/zvol.h>
	#include <sys/dmu_tx.h>
	#include <sys/zap.h>
	#include <sys/zil.h>
	#include <sys/dmu_impl.h>
	#include <sys/zfs_ioctl.h>
	#include <sys/sa.h>
	#include <sys/zfs_onexit.h>
	#include <sys/dsl_destroy.h>
	+#include <sys/vdev.h>

	/*
	* Needed to close a window in dnode_move() that allows the objset to be freed
	* before it can be safely accessed.
	*/
	krwlock_t os_lock;

	+/*
	+ * Tunable to override the maximum number of threads for the parallelization
	+ * of dmu_objset_find_dp, needed to speed up the import of pools with many
	+ * datasets.
	+ * Default is 4 times the number of leaf vdevs.
	+ */
	+int dmu_find_threads = 0;
	+
	+static void dmu_objset_find_dp_cb(void *arg);
	+
	/*
	* Initialize the file-global os_lock reader/writer lock.
	*/
	void
	dmu_objset_init(void)
	{
	rw_init(&os_lock, NULL, RW_DEFAULT, NULL);
	}

	/*
	* Destroy the file-global os_lock reader/writer lock.
	*/
	void
	dmu_objset_fini(void)
	{
	rw_destroy(&os_lock);
	}

	/*
	* Return the spa_t recorded in the objset (os->os_spa).
	*/
	spa_t *
	dmu_objset_spa(objset_t *os)
	{
	return (os->os_spa);
	}

	/*
	* Return the zilog_t recorded in the objset (os->os_zil).
	*/
	zilog_t *
	dmu_objset_zil(objset_t *os)
	{
	return (os->os_zil);
	}

	/*
	* Return the dsl_pool_t for an objset.
	*
	* When the objset has a dataset whose ds_dir is set, the pool is taken
	* from that dsl_dir; otherwise it is looked up from the objset's spa.
	*/
	dsl_pool_t *
	dmu_objset_pool(objset_t *os)
	{
	dsl_dataset_t *ds;

	if ((ds = os->os_dsl_dataset) != NULL && ds->ds_dir)
	return (ds->ds_dir->dd_pool);
	else
	return (spa_get_dsl(os->os_spa));
	}

	/*
	* Return the dataset recorded in the objset (os->os_dsl_dataset).
	* Other code in this file treats a NULL value as "this is the MOS".
	*/
	dsl_dataset_t *
	dmu_objset_ds(objset_t *os)
	{
	return (os->os_dsl_dataset);
	}

	/*
	* Return the objset type stored in the objset_phys_t (os_phys->os_type).
	*/
	dmu_objset_type_t
	dmu_objset_type(objset_t *os)
	{
	return (os->os_phys->os_type);
	}

	/*
	 * Write the name of the objset's dataset into buf by calling
	 * dsl_dataset_name() on os->os_dsl_dataset.
	 *
	 * NOTE(review): buf is presumably expected to hold MAXNAMELEN bytes, as
	 * the dsl_dataset_name() callers in this file use buffers of that size;
	 * confirm against dsl_dataset_name().
	 */
	void
	dmu_objset_name(objset_t *os, char *buf)
	{
		dsl_dataset_name(os->os_dsl_dataset, buf);
	}

	/*
	* Return the object number of the objset's dataset (ds_object), or 0
	* when the objset has no dataset.
	*/
	uint64_t
	dmu_objset_id(objset_t *os)
	{
	dsl_dataset_t *ds = os->os_dsl_dataset;

	return (ds ? ds->ds_object : 0);
	}

	/*
	* Return the sync setting cached in the objset (os->os_sync).
	*/
	zfs_sync_type_t
	dmu_objset_syncprop(objset_t *os)
	{
	return (os->os_sync);
	}

	/*
	* Return the logbias setting cached in the objset (os->os_logbias).
	*/
	zfs_logbias_op_t
	dmu_objset_logbias(objset_t *os)
	{
	return (os->os_logbias);
	}

	/*
	* Callback registered for the "checksum" property in
	* dmu_objset_open_impl().
	*
	* arg is the objset_t; newval is the new property value. The value
	* stored in os_checksum is whatever zio_checksum_select() returns for
	* newval with ZIO_CHECKSUM_ON_VALUE as its second argument.
	*/
	static void
	checksum_changed_cb(void *arg, uint64_t newval)
	{
	objset_t *os = arg;

	/*
	* Inheritance should have been done by now.
	*/
	ASSERT(newval != ZIO_CHECKSUM_INHERIT);

	os->os_checksum = zio_checksum_select(newval, ZIO_CHECKSUM_ON_VALUE);
	}

	/*
	* Callback registered for the "compression" property in
	* dmu_objset_open_impl().
	*
	* arg is the objset_t; newval is the new property value. The value
	* stored in os_compress is whatever zio_compress_select() returns for
	* newval with ZIO_COMPRESS_ON_VALUE as its second argument.
	*/
	static void
	compression_changed_cb(void *arg, uint64_t newval)
	{
	objset_t *os = arg;

	/*
	* Inheritance and range checking should have been done by now.
	*/
	ASSERT(newval != ZIO_COMPRESS_INHERIT);

	os->os_compress = zio_compress_select(newval, ZIO_COMPRESS_ON_VALUE);
	}

	/*
	* Callback registered for the "copies" property in
	* dmu_objset_open_impl().
	*
	* arg is the objset_t; newval is the new property value, which is
	* asserted to be in the range 1..spa_max_replication() and then stored
	* in os_copies.
	*/
	static void
	copies_changed_cb(void *arg, uint64_t newval)
	{
	objset_t *os = arg;

	/*
	* Inheritance and range checking should have been done by now.
	*/
	ASSERT(newval > 0);
	ASSERT(newval <= spa_max_replication(os->os_spa));

	os->os_copies = newval;
	}

	/*
	* Callback registered for the "dedup" property in
	* dmu_objset_open_impl().
	*
	* arg is the objset_t; newval is the new property value. The value
	* returned by zio_checksum_dedup_select() is split into two objset
	* fields: the bits under ZIO_CHECKSUM_MASK go to os_dedup_checksum and
	* the ZIO_CHECKSUM_VERIFY bit is reduced to a 0/1 os_dedup_verify.
	*/
	static void
	dedup_changed_cb(void *arg, uint64_t newval)
	{
	objset_t *os = arg;
	spa_t *spa = os->os_spa;
	enum zio_checksum checksum;

	/*
	* Inheritance should have been done by now.
	*/
	ASSERT(newval != ZIO_CHECKSUM_INHERIT);

	checksum = zio_checksum_dedup_select(spa, newval, ZIO_CHECKSUM_OFF);

	os->os_dedup_checksum = checksum & ZIO_CHECKSUM_MASK;
	os->os_dedup_verify = !!(checksum & ZIO_CHECKSUM_VERIFY);
	}

	/*
	 * Callback registered for the "primarycache" property in
	 * dmu_objset_open_impl().
	 *
	 * arg is the objset_t; newval is the new property value, which must be
	 * one of ZFS_CACHE_ALL, ZFS_CACHE_NONE or ZFS_CACHE_METADATA.  It is
	 * stored in os_primary_cache.
	 */
	static void
	primary_cache_changed_cb(void *arg, uint64_t newval)
	{
		objset_t *os = arg;

		/*
		 * Inheritance and range checking should have been done by now.
		 */
		ASSERT(newval == ZFS_CACHE_ALL || newval == ZFS_CACHE_NONE ||
		    newval == ZFS_CACHE_METADATA);

		os->os_primary_cache = newval;
	}

	/*
	 * Callback registered for the "secondarycache" property in
	 * dmu_objset_open_impl().
	 *
	 * arg is the objset_t; newval is the new property value, which must be
	 * one of ZFS_CACHE_ALL, ZFS_CACHE_NONE or ZFS_CACHE_METADATA.  It is
	 * stored in os_secondary_cache.
	 */
	static void
	secondary_cache_changed_cb(void *arg, uint64_t newval)
	{
		objset_t *os = arg;

		/*
		 * Inheritance and range checking should have been done by now.
		 */
		ASSERT(newval == ZFS_CACHE_ALL || newval == ZFS_CACHE_NONE ||
		    newval == ZFS_CACHE_METADATA);

		os->os_secondary_cache = newval;
	}

	/*
	 * Callback registered for the "sync" property in dmu_objset_open_impl().
	 *
	 * arg is the objset_t; newval is the new property value, which must be
	 * one of ZFS_SYNC_STANDARD, ZFS_SYNC_ALWAYS or ZFS_SYNC_DISABLED.  It is
	 * stored in os_sync and, when the objset already has a ZIL, forwarded
	 * to it with zil_set_sync().
	 */
	static void
	sync_changed_cb(void *arg, uint64_t newval)
	{
		objset_t *os = arg;

		/*
		 * Inheritance and range checking should have been done by now.
		 */
		ASSERT(newval == ZFS_SYNC_STANDARD || newval == ZFS_SYNC_ALWAYS ||
		    newval == ZFS_SYNC_DISABLED);

		os->os_sync = newval;
		if (os->os_zil)
			zil_set_sync(os->os_zil, newval);
	}

	/*
	 * Callback registered for the "redundant_metadata" property in
	 * dmu_objset_open_impl().
	 *
	 * arg is the objset_t; newval is the new property value, which must be
	 * ZFS_REDUNDANT_METADATA_ALL or ZFS_REDUNDANT_METADATA_MOST.  It is
	 * stored in os_redundant_metadata.
	 */
	static void
	redundant_metadata_changed_cb(void *arg, uint64_t newval)
	{
		objset_t *os = arg;

		/*
		 * Inheritance and range checking should have been done by now.
		 */
		ASSERT(newval == ZFS_REDUNDANT_METADATA_ALL ||
		    newval == ZFS_REDUNDANT_METADATA_MOST);

		os->os_redundant_metadata = newval;
	}

	/*
	 * Callback registered for the "logbias" property in
	 * dmu_objset_open_impl().
	 *
	 * arg is the objset_t; newval is the new property value, which must be
	 * ZFS_LOGBIAS_LATENCY or ZFS_LOGBIAS_THROUGHPUT.  It is stored in
	 * os_logbias and, when the objset already has a ZIL, forwarded to it
	 * with zil_set_logbias().
	 */
	static void
	logbias_changed_cb(void *arg, uint64_t newval)
	{
		objset_t *os = arg;

		ASSERT(newval == ZFS_LOGBIAS_LATENCY ||
		    newval == ZFS_LOGBIAS_THROUGHPUT);
		os->os_logbias = newval;
		if (os->os_zil)
			zil_set_logbias(os->os_zil, newval);
	}

	/*
	* Callback registered for the "recordsize" property in
	* dmu_objset_open_impl().
	*
	* arg is the objset_t; newval is stored in os_recordsize without any
	* checking in this function.
	*/
	static void
	recordsize_changed_cb(void *arg, uint64_t newval)
	{
	objset_t *os = arg;

	os->os_recordsize = newval;
	}

	/*
	 * Byteswap an objset_phys_t in place.
	 *
	 * buf  - the objset_phys_t to swap
	 * size - size of buf; must be OBJSET_OLD_PHYS_SIZE or
	 *        sizeof (objset_phys_t)
	 *
	 * The meta dnode, ZIL header, type and flags are always swapped.  The
	 * userused/groupused dnodes are swapped only when buf has the full
	 * sizeof (objset_phys_t) size.
	 */
	void
	dmu_objset_byteswap(void *buf, size_t size)
	{
		objset_phys_t *osp = buf;

		ASSERT(size == OBJSET_OLD_PHYS_SIZE ||
		    size == sizeof (objset_phys_t));
		dnode_byteswap(&osp->os_meta_dnode);
		byteswap_uint64_array(&osp->os_zil_header, sizeof (zil_header_t));
		osp->os_type = BSWAP_64(osp->os_type);
		osp->os_flags = BSWAP_64(osp->os_flags);
		if (size == sizeof (objset_phys_t)) {
			dnode_byteswap(&osp->os_userused_dnode);
			dnode_byteswap(&osp->os_groupused_dnode);
		}
	}

	/*
	 * Allocate and initialize the in-core objset_t for the objset rooted at
	 * bp.
	 *
	 * spa - pool the objset lives in
	 * ds  - owning dataset, or NULL for the meta-objset; when non-NULL the
	 *       caller must hold ds->ds_opening_lock (asserted)
	 * bp  - root block pointer; if it is a hole nothing is read and a zeroed
	 *       objset_phys_t buffer is allocated instead
	 * osp - on success, receives the new objset
	 *
	 * Returns 0 on success or an errno.  ECKSUM from the read is reported as
	 * EIO.  On failure the partially built objset is freed and nothing is
	 * stored through osp.
	 */
	int
	dmu_objset_open_impl(spa_t *spa, dsl_dataset_t *ds, blkptr_t *bp,
	    objset_t **osp)
	{
		objset_t *os;
		int i, err;

		ASSERT(ds == NULL || MUTEX_HELD(&ds->ds_opening_lock));

		os = kmem_zalloc(sizeof (objset_t), KM_SLEEP);
		os->os_dsl_dataset = ds;
		os->os_spa = spa;
		os->os_rootbp = bp;
		if (!BP_IS_HOLE(os->os_rootbp)) {
			arc_flags_t aflags = ARC_FLAG_WAIT;
			zbookmark_phys_t zb;
			SET_BOOKMARK(&zb, ds ? ds->ds_object : DMU_META_OBJSET,
			    ZB_ROOT_OBJECT, ZB_ROOT_LEVEL, ZB_ROOT_BLKID);

			if (DMU_OS_IS_L2CACHEABLE(os))
				aflags |= ARC_FLAG_L2CACHE;
			if (DMU_OS_IS_L2COMPRESSIBLE(os))
				aflags |= ARC_FLAG_L2COMPRESS;

			dprintf_bp(os->os_rootbp, "reading %s", "");
			err = arc_read(NULL, spa, os->os_rootbp,
			    arc_getbuf_func, &os->os_phys_buf,
			    ZIO_PRIORITY_SYNC_READ, ZIO_FLAG_CANFAIL, &aflags, &zb);
			if (err != 0) {
				kmem_free(os, sizeof (objset_t));
				/* convert checksum errors into IO errors */
				if (err == ECKSUM)
					err = SET_ERROR(EIO);
				return (err);
			}

			/* Increase the blocksize if we are permitted. */
			if (spa_version(spa) >= SPA_VERSION_USERSPACE &&
			    arc_buf_size(os->os_phys_buf) < sizeof (objset_phys_t)) {
				/*
				 * Copy the old, smaller objset_phys_t into a
				 * zero-filled full-size buffer and drop the
				 * original.
				 */
				arc_buf_t *buf = arc_buf_alloc(spa,
				    sizeof (objset_phys_t), &os->os_phys_buf,
				    ARC_BUFC_METADATA);
				bzero(buf->b_data, sizeof (objset_phys_t));
				bcopy(os->os_phys_buf->b_data, buf->b_data,
				    arc_buf_size(os->os_phys_buf));
				(void) arc_buf_remove_ref(os->os_phys_buf,
				    &os->os_phys_buf);
				os->os_phys_buf = buf;
			}

			os->os_phys = os->os_phys_buf->b_data;
			os->os_flags = os->os_phys->os_flags;
		} else {
			int size = spa_version(spa) >= SPA_VERSION_USERSPACE ?
			    sizeof (objset_phys_t) : OBJSET_OLD_PHYS_SIZE;
			os->os_phys_buf = arc_buf_alloc(spa, size,
			    &os->os_phys_buf, ARC_BUFC_METADATA);
			os->os_phys = os->os_phys_buf->b_data;
			bzero(os->os_phys, size);
		}

		/*
		 * Note: the changed_cb will be called once before the register
		 * func returns, thus changing the checksum/compression from the
		 * default (fletcher2/off).  Snapshots don't need to know about
		 * checksum/compression/copies.
		 */
		if (ds != NULL) {
			err = dsl_prop_register(ds,
			    zfs_prop_to_name(ZFS_PROP_PRIMARYCACHE),
			    primary_cache_changed_cb, os);
			if (err == 0) {
				err = dsl_prop_register(ds,
				    zfs_prop_to_name(ZFS_PROP_SECONDARYCACHE),
				    secondary_cache_changed_cb, os);
			}
			if (!ds->ds_is_snapshot) {
				if (err == 0) {
					err = dsl_prop_register(ds,
					    zfs_prop_to_name(ZFS_PROP_CHECKSUM),
					    checksum_changed_cb, os);
				}
				if (err == 0) {
					err = dsl_prop_register(ds,
					    zfs_prop_to_name(ZFS_PROP_COMPRESSION),
					    compression_changed_cb, os);
				}
				if (err == 0) {
					err = dsl_prop_register(ds,
					    zfs_prop_to_name(ZFS_PROP_COPIES),
					    copies_changed_cb, os);
				}
				if (err == 0) {
					err = dsl_prop_register(ds,
					    zfs_prop_to_name(ZFS_PROP_DEDUP),
					    dedup_changed_cb, os);
				}
				if (err == 0) {
					err = dsl_prop_register(ds,
					    zfs_prop_to_name(ZFS_PROP_LOGBIAS),
					    logbias_changed_cb, os);
				}
				if (err == 0) {
					err = dsl_prop_register(ds,
					    zfs_prop_to_name(ZFS_PROP_SYNC),
					    sync_changed_cb, os);
				}
				if (err == 0) {
					err = dsl_prop_register(ds,
					    zfs_prop_to_name(
					    ZFS_PROP_REDUNDANT_METADATA),
					    redundant_metadata_changed_cb, os);
				}
				if (err == 0) {
					err = dsl_prop_register(ds,
					    zfs_prop_to_name(ZFS_PROP_RECORDSIZE),
					    recordsize_changed_cb, os);
				}
			}
			if (err != 0) {
				/*
				 * NOTE(review): callbacks that registered
				 * successfully before the failure are not
				 * unregistered here before os is freed; confirm
				 * that dsl_prop handles this.
				 */
				VERIFY(arc_buf_remove_ref(os->os_phys_buf,
				    &os->os_phys_buf));
				kmem_free(os, sizeof (objset_t));
				return (err);
			}
		} else {
			/* It's the meta-objset. */
			os->os_checksum = ZIO_CHECKSUM_FLETCHER_4;
			os->os_compress = ZIO_COMPRESS_LZJB;
			os->os_copies = spa_max_replication(spa);
			os->os_dedup_checksum = ZIO_CHECKSUM_OFF;
			os->os_dedup_verify = B_FALSE;
			os->os_logbias = ZFS_LOGBIAS_LATENCY;
			os->os_sync = ZFS_SYNC_STANDARD;
			os->os_primary_cache = ZFS_CACHE_ALL;
			os->os_secondary_cache = ZFS_CACHE_ALL;
		}

		/* Snapshots keep the zeroed ZIL header from kmem_zalloc(). */
		if (ds == NULL || !ds->ds_is_snapshot)
			os->os_zil_header = os->os_phys->os_zil_header;
		os->os_zil = zil_alloc(os, &os->os_zil_header);

		for (i = 0; i < TXG_SIZE; i++) {
			list_create(&os->os_dirty_dnodes[i], sizeof (dnode_t),
			    offsetof(dnode_t, dn_dirty_link[i]));
			list_create(&os->os_free_dnodes[i], sizeof (dnode_t),
			    offsetof(dnode_t, dn_dirty_link[i]));
		}
		list_create(&os->os_dnodes, sizeof (dnode_t),
		    offsetof(dnode_t, dn_link));
		list_create(&os->os_downgraded_dbufs, sizeof (dmu_buf_impl_t),
		    offsetof(dmu_buf_impl_t, db_link));

		mutex_init(&os->os_lock, NULL, MUTEX_DEFAULT, NULL);
		mutex_init(&os->os_obj_lock, NULL, MUTEX_DEFAULT, NULL);
		mutex_init(&os->os_user_ptr_lock, NULL, MUTEX_DEFAULT, NULL);

		dnode_special_open(os, &os->os_phys->os_meta_dnode,
		    DMU_META_DNODE_OBJECT, &os->os_meta_dnode);
		/* The accounting dnodes exist only in a full-size objset_phys_t. */
		if (arc_buf_size(os->os_phys_buf) >= sizeof (objset_phys_t)) {
			dnode_special_open(os, &os->os_phys->os_userused_dnode,
			    DMU_USERUSED_OBJECT, &os->os_userused_dnode);
			dnode_special_open(os, &os->os_phys->os_groupused_dnode,
			    DMU_GROUPUSED_OBJECT, &os->os_groupused_dnode);
		}

		*osp = os;
		return (0);
	}

	/*
	 * Return, through osp, the objset cached on a dataset, opening it with
	 * dmu_objset_open_impl() first if ds->ds_objset is still NULL.
	 *
	 * The lookup and open run under ds->ds_opening_lock; publishing the new
	 * objset into ds->ds_objset is done under ds->ds_lock.
	 *
	 * Returns 0 or the error from dmu_objset_open_impl().  *osp is assigned
	 * ds->ds_objset in either case, so it is NULL when the open failed.
	 */
	int
	dmu_objset_from_ds(dsl_dataset_t *ds, objset_t **osp)
	{
		int err = 0;

		mutex_enter(&ds->ds_opening_lock);
		if (ds->ds_objset == NULL) {
			objset_t *os;
			err = dmu_objset_open_impl(dsl_dataset_get_spa(ds),
			    ds, dsl_dataset_get_blkptr(ds), &os);

			if (err == 0) {
				mutex_enter(&ds->ds_lock);
				ASSERT(ds->ds_objset == NULL);
				ds->ds_objset = os;
				mutex_exit(&ds->ds_lock);
			}
		}
		*osp = ds->ds_objset;
		mutex_exit(&ds->ds_opening_lock);
		return (err);
	}

	/*
	 * Holds the pool while the objset is held.  Therefore only one objset
	 * can be held at a time.
	 *
	 * name - dataset name; also used to look up the pool
	 * tag  - hold tag used for both the pool hold and the dataset hold
	 * osp  - receives the objset on success
	 *
	 * Returns 0 on success or an errno.  On failure every hold taken here
	 * has been released again.  On success the holds are dropped by
	 * dmu_objset_rele() with the same tag.
	 */
	int
	dmu_objset_hold(const char *name, void *tag, objset_t **osp)
	{
		dsl_pool_t *dp;
		dsl_dataset_t *ds;
		int err;

		err = dsl_pool_hold(name, tag, &dp);
		if (err != 0)
			return (err);
		err = dsl_dataset_hold(dp, name, tag, &ds);
		if (err != 0) {
			dsl_pool_rele(dp, tag);
			return (err);
		}

		err = dmu_objset_from_ds(ds, osp);
		if (err != 0) {
			dsl_dataset_rele(ds, tag);
			dsl_pool_rele(dp, tag);
		}

		return (err);
	}

	+/*
	+ * Common tail of dmu_objset_own() and dmu_objset_own_obj(): given a
	+ * dataset already owned by 'tag', obtain its objset and validate it.
	+ *
	+ * type     - required objset type, or DMU_OST_ANY to accept any type
	+ * readonly - when B_FALSE, snapshots are rejected
	+ * osp      - receives the objset
	+ *
	+ * Returns 0 on success.  On any failure the dataset is disowned before
	+ * returning: the dmu_objset_from_ds() error is passed through, a type
	+ * mismatch yields EINVAL, and a writable open of a snapshot yields EROFS.
	+ */
	+static int
	+dmu_objset_own_impl(dsl_dataset_t *ds, dmu_objset_type_t type,
	+    boolean_t readonly, void *tag, objset_t **osp)
	+{
	+	int err;
	+
	+	err = dmu_objset_from_ds(ds, osp);
	+	if (err != 0) {
	+		dsl_dataset_disown(ds, tag);
	+	} else if (type != DMU_OST_ANY && type != (*osp)->os_phys->os_type) {
	+		dsl_dataset_disown(ds, tag);
	+		return (SET_ERROR(EINVAL));
	+	} else if (!readonly && ds->ds_is_snapshot) {
	+		/*
	+		 * Tests the ds_is_snapshot field directly, as the rest of
	+		 * this file does.
	+		 */
	+		dsl_dataset_disown(ds, tag);
	+		return (SET_ERROR(EROFS));
	+	}
	+	return (err);
	+}
	+
	/*
	 * dsl_pool must not be held when this is called.
	 * Upon successful return, there will be a longhold on the dataset,
	 * and the dsl_pool will not be held.
	 *
	 * name     - dataset name; also used to look up the pool
	 * type     - required objset type, or DMU_OST_ANY
	 * readonly - when B_FALSE, snapshots are rejected with EROFS
	 * tag      - owner tag for the dataset
	 * osp      - receives the objset on success
	 *
	 * The pool is held with FTAG only for the duration of this call.
	 */
	int
	dmu_objset_own(const char *name, dmu_objset_type_t type,
	    boolean_t readonly, void *tag, objset_t **osp)
	{
		dsl_pool_t *dp;
		dsl_dataset_t *ds;
		int err;

		err = dsl_pool_hold(name, FTAG, &dp);
		if (err != 0)
			return (err);
		err = dsl_dataset_own(dp, name, tag, &ds);
		if (err != 0) {
			dsl_pool_rele(dp, FTAG);
			return (err);
		}
	-
	-	err = dmu_objset_from_ds(ds, osp);
	+	err = dmu_objset_own_impl(ds, type, readonly, tag, osp);
		dsl_pool_rele(dp, FTAG);
	-	if (err != 0) {
	-		dsl_dataset_disown(ds, tag);
	-	} else if (type != DMU_OST_ANY && type != (*osp)->os_phys->os_type) {
	-		dsl_dataset_disown(ds, tag);
	-		return (SET_ERROR(EINVAL));
	-	} else if (!readonly && ds->ds_is_snapshot) {
	-		dsl_dataset_disown(ds, tag);
	-		return (SET_ERROR(EROFS));
	-	}
	+
		return (err);
	}

	+/*
	+ * Like dmu_objset_own(), but the dataset is identified by its object
	+ * number within an already known pool instead of by name.  No pool hold
	+ * is taken or released here.
	+ *
	+ * dp       - pool containing the dataset
	+ * obj      - dataset object number
	+ * type     - required objset type, or DMU_OST_ANY
	+ * readonly - when B_FALSE, snapshots are rejected with EROFS
	+ * tag      - owner tag for the dataset
	+ * osp      - receives the objset on success
	+ *
	+ * Returns 0 on success, or the error from dsl_dataset_own_obj() or
	+ * dmu_objset_own_impl().
	+ */
	+int
	+dmu_objset_own_obj(dsl_pool_t *dp, uint64_t obj, dmu_objset_type_t type,
	+    boolean_t readonly, void *tag, objset_t **osp)
	+{
	+	dsl_dataset_t *ds;
	+	int err;
	+
	+	err = dsl_dataset_own_obj(dp, obj, tag, &ds);
	+	if (err != 0)
	+		return (err);
	+
	+	return (dmu_objset_own_impl(ds, type, readonly, tag, osp));
	+}
	+
	/*
	 * Release the holds taken by dmu_objset_hold(): first the dataset hold,
	 * then the pool hold, both with 'tag'.
	 *
	 * The pool pointer is fetched before the dataset is released because it
	 * is derived from the objset's dataset.
	 */
	void
	dmu_objset_rele(objset_t *os, void *tag)
	{
		dsl_pool_t *dp = dmu_objset_pool(os);
		dsl_dataset_rele(os->os_dsl_dataset, tag);
		dsl_pool_rele(dp, tag);
	}

	/*
	 * When we are called, os MUST refer to an objset associated with a
	 * dataset that is owned by 'tag'; that is, is held and long held by
	 * 'tag' and ds_owner == tag.  We will then release and reacquire
	 * ownership of the dataset while holding the pool config_rwlock so that
	 * no intervening namespace or ownership changes can occur.
	 *
	 * This exists solely to accommodate zfs_ioc_userspace_upgrade()'s desire
	 * to release the hold on its dataset and acquire a new one on the
	 * dataset of the same name so that it can be partially torn down and
	 * reconstructed.
	 *
	 * The preconditions and the re-own are enforced with VERIFY, so any
	 * violation panics rather than returning an error.
	 */
	void
	dmu_objset_refresh_ownership(objset_t *os, void *tag)
	{
		dsl_pool_t *dp;
		dsl_dataset_t *ds, *newds;
		char name[MAXNAMELEN];

		ds = os->os_dsl_dataset;
		VERIFY3P(ds, !=, NULL);
		VERIFY3P(ds->ds_owner, ==, tag);
		VERIFY(dsl_dataset_long_held(ds));

		/* Capture the name and pool while the dataset is still owned. */
		dsl_dataset_name(ds, name);
		dp = dmu_objset_pool(os);
		dsl_pool_config_enter(dp, FTAG);
		dmu_objset_disown(os, tag);
		VERIFY0(dsl_dataset_own(dp, name, tag, &newds));
		VERIFY3P(newds, ==, os->os_dsl_dataset);
		dsl_pool_config_exit(dp, FTAG);
	}

	/*
	 * Give up ownership of the objset's dataset by calling
	 * dsl_dataset_disown() with 'tag'.
	 */
	void
	dmu_objset_disown(objset_t *os, void *tag)
	{
		dsl_dataset_disown(os->os_dsl_dataset, tag);
	}

	/*
	* Evict the dbufs of every dnode on os->os_dnodes that currently has
	* a hold, then those of the special dnodes (group/user accounting, if
	* present, and finally the meta dnode).
	*
	* os_lock is dropped around each dnode_evict_dbufs() call; a marker
	* dnode on the stack is inserted after the current dnode to keep the
	* position in the list while the lock is not held.
	*/
	void
	dmu_objset_evict_dbufs(objset_t *os)
	{
	dnode_t dn_marker;
	dnode_t *dn;

	mutex_enter(&os->os_lock);
	dn = list_head(&os->os_dnodes);
	while (dn != NULL) {
	/*
	* Skip dnodes without holds. We have to do this dance
	* because dnode_add_ref() only works if there is already a
	* hold. If the dnode has no holds, then it has no dbufs.
	*/
	if (dnode_add_ref(dn, FTAG)) {
	list_insert_after(&os->os_dnodes, dn, &dn_marker);
	mutex_exit(&os->os_lock);

	dnode_evict_dbufs(dn);
	dnode_rele(dn, FTAG);

	mutex_enter(&os->os_lock);
	/* Resume from the marker, then take it back off the list. */
	dn = list_next(&os->os_dnodes, &dn_marker);
	list_remove(&os->os_dnodes, &dn_marker);
	} else {
	dn = list_next(&os->os_dnodes, dn);
	}
	}
	mutex_exit(&os->os_lock);

	if (DMU_USERUSED_DNODE(os) != NULL) {
	dnode_evict_dbufs(DMU_GROUPUSED_DNODE(os));
	dnode_evict_dbufs(DMU_USERUSED_DNODE(os));
	}
	dnode_evict_dbufs(DMU_META_DNODE(os));
	}

	/*
	* Objset eviction processing is split into two pieces.
	* The first marks the objset as evicting, evicts any dbufs that
	* have a refcount of zero, and then queues up the objset for the
	* second phase of eviction. Once os->os_dnodes has been cleared by
	* dnode_buf_pageout()->dnode_destroy(), the second phase is executed.
	* The second phase closes the special dnodes, dequeues the objset from
	* the list of those undergoing eviction, and finally frees the objset.
	*
	* NOTE: Due to asynchronous eviction processing (invocation of
	* dnode_buf_pageout()), it is possible for the meta dnode for the
	* objset to have no holds even though os->os_dnodes is not empty.
	*
	* This function is the first phase. It asserts that the objset is not
	* dirty in any txg slot and unregisters the property callbacks that
	* dmu_objset_open_impl() registered. If os_dnodes is already empty
	* once the objset has been registered as evicting, the second phase
	* (dmu_objset_evict_done()) is run from here directly.
	*/
	void
	dmu_objset_evict(objset_t *os)
	{
	dsl_dataset_t *ds = os->os_dsl_dataset;

	for (int t = 0; t < TXG_SIZE; t++)
	ASSERT(!dmu_objset_is_dirty(os, t));

	if (ds) {
	/* These callbacks are only registered for non-snapshots. */
	if (!ds->ds_is_snapshot) {
	VERIFY0(dsl_prop_unregister(ds,
	zfs_prop_to_name(ZFS_PROP_CHECKSUM),
	checksum_changed_cb, os));
	VERIFY0(dsl_prop_unregister(ds,
	zfs_prop_to_name(ZFS_PROP_COMPRESSION),
	compression_changed_cb, os));
	VERIFY0(dsl_prop_unregister(ds,
	zfs_prop_to_name(ZFS_PROP_COPIES),
	copies_changed_cb, os));
	VERIFY0(dsl_prop_unregister(ds,
	zfs_prop_to_name(ZFS_PROP_DEDUP),
	dedup_changed_cb, os));
	VERIFY0(dsl_prop_unregister(ds,
	zfs_prop_to_name(ZFS_PROP_LOGBIAS),
	logbias_changed_cb, os));
	VERIFY0(dsl_prop_unregister(ds,
	zfs_prop_to_name(ZFS_PROP_SYNC),
	sync_changed_cb, os));
	VERIFY0(dsl_prop_unregister(ds,
	zfs_prop_to_name(ZFS_PROP_REDUNDANT_METADATA),
	redundant_metadata_changed_cb, os));
	VERIFY0(dsl_prop_unregister(ds,
	zfs_prop_to_name(ZFS_PROP_RECORDSIZE),
	recordsize_changed_cb, os));
	}
	VERIFY0(dsl_prop_unregister(ds,
	zfs_prop_to_name(ZFS_PROP_PRIMARYCACHE),
	primary_cache_changed_cb, os));
	VERIFY0(dsl_prop_unregister(ds,
	zfs_prop_to_name(ZFS_PROP_SECONDARYCACHE),
	secondary_cache_changed_cb, os));
	}

	if (os->os_sa)
	sa_tear_down(os);

	os->os_evicting = B_TRUE;
	dmu_objset_evict_dbufs(os);

	mutex_enter(&os->os_lock);
	spa_evicting_os_register(os->os_spa, os);
	if (list_is_empty(&os->os_dnodes)) {
	/* os_lock is dropped first: evict_done destroys it. */
	mutex_exit(&os->os_lock);
	dmu_objset_evict_done(os);
	} else {
	mutex_exit(&os->os_lock);
	}
	}

	/*
	* Second phase of objset eviction: tear down and free the objset.
	*
	* os_dnodes must already be empty (asserted). Closes the special
	* dnodes, frees the ZIL, drops the reference on the objset_phys_t
	* buffer, destroys the objset's mutexes, deregisters the objset from
	* the spa's set of evicting objsets, and frees the objset_t. os must
	* not be used after this returns.
	*/
	void
	dmu_objset_evict_done(objset_t *os)
	{
	ASSERT3P(list_head(&os->os_dnodes), ==, NULL);

	dnode_special_close(&os->os_meta_dnode);
	if (DMU_USERUSED_DNODE(os)) {
	dnode_special_close(&os->os_userused_dnode);
	dnode_special_close(&os->os_groupused_dnode);
	}
	zil_free(os->os_zil);

	VERIFY(arc_buf_remove_ref(os->os_phys_buf, &os->os_phys_buf));

	/*
	* This is a barrier to prevent the objset from going away in
	* dnode_move() until we can safely ensure that the objset is still in
	* use. We consider the objset valid before the barrier and invalid
	* after the barrier.
	*/
	rw_enter(&os_lock, RW_READER);
	rw_exit(&os_lock);

	mutex_destroy(&os->os_lock);
	mutex_destroy(&os->os_obj_lock);
	mutex_destroy(&os->os_user_ptr_lock);
	spa_evicting_os_deregister(os->os_spa, os);
	kmem_free(os, sizeof (objset_t));
	}

	/*
	* Return dsl_dir_snap_cmtime() for the dsl_dir of the objset's dataset.
	* os->os_dsl_dataset is dereferenced without a NULL check, so this must
	* not be called on an objset that has no dataset.
	*/
	timestruc_t
	dmu_objset_snap_cmtime(objset_t *os)
	{
	return (dsl_dir_snap_cmtime(os->os_dsl_dataset->ds_dir));
	}

	/*
	 * called from dsl for meta-objset
	 *
	 * Initialize a new objset in syncing context and return it.
	 *
	 * spa  - pool the objset lives in
	 * ds   - dataset the objset belongs to, or NULL for the MOS
	 * bp   - root block pointer; only used when ds is NULL
	 * type - objset type to record; must be a concrete type (not
	 *        DMU_OST_NONE or DMU_OST_ANY)
	 * tx   - syncing transaction
	 *
	 * Opening the objset is done with VERIFY0, so failure panics.
	 */
	objset_t *
	dmu_objset_create_impl(spa_t *spa, dsl_dataset_t *ds, blkptr_t *bp,
	    dmu_objset_type_t type, dmu_tx_t *tx)
	{
		objset_t *os;
		dnode_t *mdn;

		ASSERT(dmu_tx_is_syncing(tx));

		if (ds != NULL)
			VERIFY0(dmu_objset_from_ds(ds, &os));
		else
			VERIFY0(dmu_objset_open_impl(spa, NULL, bp, &os));

		mdn = DMU_META_DNODE(os);

		dnode_allocate(mdn, DMU_OT_DNODE, 1 << DNODE_BLOCK_SHIFT,
		    DN_MAX_INDBLKSHIFT, DMU_OT_NONE, 0, tx);

		/*
		 * We don't want to have to increase the meta-dnode's nlevels
		 * later, because then we could do it in quiescing context while
		 * we are also accessing it in open context.
		 *
		 * This precaution is not necessary for the MOS (ds == NULL),
		 * because the MOS is only updated in syncing context.
		 * This is most fortunate: the MOS is the only objset that
		 * needs to be synced multiple times as spa_sync() iterates
		 * to convergence, so minimizing its dn_nlevels matters.
		 */
		if (ds != NULL) {
			int levels = 1;

			/*
			 * Determine the number of levels necessary for the
			 * meta-dnode to contain DN_MAX_OBJECT dnodes.
			 */
			while ((uint64_t)mdn->dn_nblkptr << (mdn->dn_datablkshift +
			    (levels - 1) * (mdn->dn_indblkshift - SPA_BLKPTRSHIFT)) <
			    DN_MAX_OBJECT * sizeof (dnode_phys_t))
				levels++;

			mdn->dn_next_nlevels[tx->tx_txg & TXG_MASK] =
			    mdn->dn_nlevels = levels;
		}

		ASSERT(type != DMU_OST_NONE);
		ASSERT(type != DMU_OST_ANY);
		ASSERT(type < DMU_OST_NUMTYPES);
		os->os_phys->os_type = type;
		if (dmu_objset_userused_enabled(os)) {
			os->os_phys->os_flags |= OBJSET_FLAG_USERACCOUNTING_COMPLETE;
			os->os_flags = os->os_phys->os_flags;
		}

		/*
		 * NOTE(review): ds is NULL here in the MOS case; presumably
		 * dsl_dataset_dirty() tolerates a NULL dataset -- confirm.
		 */
		dsl_dataset_dirty(ds, tx);

		return (os);
	}

	/*
	 * Arguments passed from dmu_objset_create() to its check and sync
	 * functions via dsl_sync_task().
	 */
	typedef struct dmu_objset_create_arg {
		const char *doca_name;		/* name of the dataset to create */
		cred_t *doca_cred;		/* credentials of the creator */
		/* optional callback run on the new objset in the sync func */
		void (*doca_userfunc)(objset_t *os, void *arg,
		    cred_t *cr, dmu_tx_t *tx);
		void *doca_userarg;		/* passed to doca_userfunc as arg */
		dmu_objset_type_t doca_type;	/* type of the new objset */
		uint64_t doca_flags;		/* for dsl_dataset_create_sync() */
	} dmu_objset_create_arg_t;

	/ARGSUSED/
	static int
	dmu_objset_create_check(void arg, dmu_tx_t tx)
	{
	dmu_objset_create_arg_t *doca = arg;
	dsl_pool_t *dp = dmu_tx_pool(tx);
	dsl_dir_t *pdd;
	const char *tail;
	int error;

	if (strchr(doca->doca_name, '@') != NULL)
	return (SET_ERROR(EINVAL));

	error = dsl_dir_hold(dp, doca->doca_name, FTAG, &pdd, &tail);
	if (error != 0)
	return (error);
	if (tail == NULL) {
	dsl_dir_rele(pdd, FTAG);
	return (SET_ERROR(EEXIST));
	}
	error = dsl_fs_ss_limit_check(pdd, 1, ZFS_PROP_FILESYSTEM_LIMIT, NULL,
	doca->doca_cred);
	dsl_dir_rele(pdd, FTAG);

	return (error);
	}

	/*
	 * Sync function for dmu_objset_create().
	 *
	 * arg is a dmu_objset_create_arg_t.  Creates the dataset under the
	 * parent dsl_dir, initializes its objset with the requested type, runs
	 * the caller's callback (if any) on the new objset, and logs a "create"
	 * history record.  All lookups use VERIFY0, so failure panics.
	 */
	static void
	dmu_objset_create_sync(void *arg, dmu_tx_t *tx)
	{
		dmu_objset_create_arg_t *doca = arg;
		dsl_pool_t *dp = dmu_tx_pool(tx);
		dsl_dir_t *pdd;
		const char *tail;
		dsl_dataset_t *ds;
		uint64_t obj;
		blkptr_t *bp;
		objset_t *os;

		VERIFY0(dsl_dir_hold(dp, doca->doca_name, FTAG, &pdd, &tail));

		obj = dsl_dataset_create_sync(pdd, tail, NULL, doca->doca_flags,
		    doca->doca_cred, tx);

		VERIFY0(dsl_dataset_hold_obj(pdd->dd_pool, obj, FTAG, &ds));
		bp = dsl_dataset_get_blkptr(ds);
		os = dmu_objset_create_impl(pdd->dd_pool->dp_spa,
		    ds, bp, doca->doca_type, tx);

		if (doca->doca_userfunc != NULL) {
			doca->doca_userfunc(os, doca->doca_userarg,
			    doca->doca_cred, tx);
		}

		spa_history_log_internal_ds(ds, "create", tx, "");
		dsl_dataset_rele(ds, FTAG);
		dsl_dir_rele(pdd, FTAG);
	}

	/*
	 * Create a new objset (dataset) called 'name'.
	 *
	 * name  - full dataset name; must not contain '@'
	 * type  - objset type to create
	 * flags - passed through to dsl_dataset_create_sync()
	 * func  - optional callback invoked on the new objset in syncing
	 *         context, or NULL
	 * arg   - opaque argument handed to func
	 *
	 * The work is done as a sync task using the caller's credentials
	 * (CRED()).  Returns the result of dsl_sync_task().
	 */
	int
	dmu_objset_create(const char *name, dmu_objset_type_t type, uint64_t flags,
	    void (*func)(objset_t *os, void *arg, cred_t *cr, dmu_tx_t *tx), void *arg)
	{
		dmu_objset_create_arg_t doca;

		doca.doca_name = name;
		doca.doca_cred = CRED();
		doca.doca_flags = flags;
		doca.doca_userfunc = func;
		doca.doca_userarg = arg;
		doca.doca_type = type;

		return (dsl_sync_task(name,
		    dmu_objset_create_check, dmu_objset_create_sync, &doca,
		    5, ZFS_SPACE_CHECK_NORMAL));
	}

	/*
	* Arguments passed from dmu_objset_clone() to its check and sync
	* functions via dsl_sync_task():
	* doca_clone - name of the clone to create
	* doca_origin - name of the dataset to clone from
	* doca_cred - credentials of the caller
	*/
	typedef struct dmu_objset_clone_arg {
	const char *doca_clone;
	const char *doca_origin;
	cred_t *doca_cred;
	} dmu_objset_clone_arg_t;

	/ARGSUSED/
	static int
	dmu_objset_clone_check(void arg, dmu_tx_t tx)
	{
	dmu_objset_clone_arg_t *doca = arg;
	dsl_dir_t *pdd;
	const char *tail;
	int error;
	dsl_dataset_t *origin;
	dsl_pool_t *dp = dmu_tx_pool(tx);

	if (strchr(doca->doca_clone, '@') != NULL)
	return (SET_ERROR(EINVAL));

	error = dsl_dir_hold(dp, doca->doca_clone, FTAG, &pdd, &tail);
	if (error != 0)
	return (error);
	if (tail == NULL) {
	dsl_dir_rele(pdd, FTAG);
	return (SET_ERROR(EEXIST));
	}

	error = dsl_fs_ss_limit_check(pdd, 1, ZFS_PROP_FILESYSTEM_LIMIT, NULL,
	doca->doca_cred);
	if (error != 0) {
	dsl_dir_rele(pdd, FTAG);
	return (SET_ERROR(EDQUOT));
	}
	dsl_dir_rele(pdd, FTAG);

	error = dsl_dataset_hold(dp, doca->doca_origin, FTAG, &origin);
	if (error != 0)
	return (error);

	/* You can only clone snapshots, not the head datasets. */
	if (!origin->ds_is_snapshot) {
	dsl_dataset_rele(origin, FTAG);
	return (SET_ERROR(EINVAL));
	}
	dsl_dataset_rele(origin, FTAG);

	return (0);
	}

	/*
	 * Sync function for dmu_objset_clone().
	 *
	 * arg is a dmu_objset_clone_arg_t.  Creates the clone dataset under its
	 * parent dsl_dir with the origin dataset as its origin, and logs a
	 * "clone" history record naming the origin.  All lookups use VERIFY0,
	 * so failure panics.
	 */
	static void
	dmu_objset_clone_sync(void *arg, dmu_tx_t *tx)
	{
		dmu_objset_clone_arg_t *doca = arg;
		dsl_pool_t *dp = dmu_tx_pool(tx);
		dsl_dir_t *pdd;
		const char *tail;
		dsl_dataset_t *origin, *ds;
		uint64_t obj;
		char namebuf[MAXNAMELEN];

		VERIFY0(dsl_dir_hold(dp, doca->doca_clone, FTAG, &pdd, &tail));
		VERIFY0(dsl_dataset_hold(dp, doca->doca_origin, FTAG, &origin));

		obj = dsl_dataset_create_sync(pdd, tail, origin, 0,
		    doca->doca_cred, tx);

		VERIFY0(dsl_dataset_hold_obj(pdd->dd_pool, obj, FTAG, &ds));
		dsl_dataset_name(origin, namebuf);
		spa_history_log_internal_ds(ds, "clone", tx,
		    "origin=%s (%llu)", namebuf, origin->ds_object);
		dsl_dataset_rele(ds, FTAG);
		dsl_dataset_rele(origin, FTAG);
		dsl_dir_rele(pdd, FTAG);
	}

	/*
	 * Create the dataset 'clone' as a clone of the snapshot 'origin'.
	 *
	 * The work is done as a sync task using the caller's credentials
	 * (CRED()).  Returns the result of dsl_sync_task().
	 */
	int
	dmu_objset_clone(const char *clone, const char *origin)
	{
		dmu_objset_clone_arg_t doca;

		doca.doca_clone = clone;
		doca.doca_origin = origin;
		doca.doca_cred = CRED();

		return (dsl_sync_task(clone,
		    dmu_objset_clone_check, dmu_objset_clone_sync, &doca,
		    5, ZFS_SPACE_CHECK_NORMAL));
	}

	/*
	 * Take the single snapshot "fsname@snapname".
	 *
	 * Builds a one-entry nvlist holding the full snapshot name and hands it
	 * to dsl_dataset_snapshot() with no properties and no error list.
	 * Returns the result of dsl_dataset_snapshot().
	 */
	int
	dmu_objset_snapshot_one(const char *fsname, const char *snapname)
	{
		int err;
		char *longsnap = kmem_asprintf("%s@%s", fsname, snapname);
		nvlist_t *snaps = fnvlist_alloc();

		fnvlist_add_boolean(snaps, longsnap);
		/*
		 * NOTE(review): longsnap is freed before the nvlist is used, so
		 * the nvlist presumably keeps its own copy of the name -- confirm.
		 */
		strfree(longsnap);
		err = dsl_dataset_snapshot(snaps, NULL, NULL);
		fnvlist_free(snaps);
		return (err);
	}

	/*
	 * Sync every dnode on 'list', emptying the list.
	 *
	 * list    - list of dnodes to sync
	 * newlist - if non-NULL, each dnode is given an extra reference tagged
	 *           with newlist and appended to it before being synced
	 *           (dmu_objset_do_userquota_updates() later drops that
	 *           reference with dnode_rele())
	 * tx      - syncing transaction
	 */
	static void
	dmu_objset_sync_dnodes(list_t *list, list_t *newlist, dmu_tx_t *tx)
	{
		dnode_t *dn;

		while ((dn = list_head(list)) != NULL) {
			ASSERT(dn->dn_object != DMU_META_DNODE_OBJECT);
			ASSERT(dn->dn_dbuf->db_data_pending);
			/*
			 * Initialize dn_zio outside dnode_sync() because the
			 * meta-dnode needs to set it outside dnode_sync().
			 */
			dn->dn_zio = dn->dn_dbuf->db_data_pending->dr_zio;
			ASSERT(dn->dn_zio);

			ASSERT3U(dn->dn_nlevels, <=, DN_MAX_LEVELS);
			list_remove(list, dn);

			if (newlist) {
				(void) dnode_add_ref(dn, newlist);
				list_insert_tail(newlist, dn);
			}

			dnode_sync(dn, tx);
		}
	}

	/*
	 * "Ready" callback passed to arc_write() for the objset's root block.
	 *
	 * arg is the objset_t.  Recomputes the root block pointer's fill count
	 * as the sum of the fill counts of the meta dnode's block pointers.
	 */
	/* ARGSUSED */
	static void
	dmu_objset_write_ready(zio_t *zio, arc_buf_t *abuf, void *arg)
	{
		blkptr_t *bp = zio->io_bp;
		objset_t *os = arg;
		dnode_phys_t *dnp = &os->os_phys->os_meta_dnode;

		ASSERT(!BP_IS_EMBEDDED(bp));
		ASSERT3P(bp, ==, os->os_rootbp);
		ASSERT3U(BP_GET_TYPE(bp), ==, DMU_OT_OBJSET);
		ASSERT0(BP_GET_LEVEL(bp));

		/*
		 * Update rootbp fill count: it should be the number of objects
		 * allocated in the object set (not counting the "special"
		 * objects that are stored in the objset_phys_t -- the meta
		 * dnode and user/group accounting objects).
		 */
		bp->blk_fill = 0;
		for (int i = 0; i < dnp->dn_nblkptr; i++)
			bp->blk_fill += BP_GET_FILL(&dnp->dn_blkptr[i]);
	}

	/*
	 * "Done" callback passed to arc_write() for the objset's root block.
	 *
	 * arg is the objset_t.  For a rewrite in place the block pointer must be
	 * unchanged.  Otherwise the old root block is killed and the new one is
	 * recorded as born in the dataset, using the tx that dmu_objset_sync()
	 * saved in os_synctx.
	 */
	/* ARGSUSED */
	static void
	dmu_objset_write_done(zio_t *zio, arc_buf_t *abuf, void *arg)
	{
		blkptr_t *bp = zio->io_bp;
		blkptr_t *bp_orig = &zio->io_bp_orig;
		objset_t *os = arg;

		if (zio->io_flags & ZIO_FLAG_IO_REWRITE) {
			ASSERT(BP_EQUAL(bp, bp_orig));
		} else {
			dsl_dataset_t *ds = os->os_dsl_dataset;
			dmu_tx_t *tx = os->os_synctx;

			(void) dsl_dataset_block_kill(ds, bp_orig, tx, B_TRUE);
			dsl_dataset_block_born(ds, bp, tx);
		}
	}

	/*
	 * called from dsl
	 *
	 * Write out an objset for the txg of 'tx' (syncing context only).
	 *
	 * os  - objset to sync
	 * pio - parent zio for the root block write
	 * tx  - syncing transaction
	 *
	 * Creates the root block write, syncs the special dnodes and then the
	 * free and dirty dnode lists of this txg under it, issues the meta
	 * dnode's dirty-record zios, syncs the ZIL, and finally issues the root
	 * block write without waiting for it.
	 */
	void
	dmu_objset_sync(objset_t *os, zio_t *pio, dmu_tx_t *tx)
	{
		int txgoff;
		zbookmark_phys_t zb;
		zio_prop_t zp;
		zio_t *zio;
		list_t *list;
		list_t *newlist = NULL;
		dbuf_dirty_record_t *dr;

		dprintf_ds(os->os_dsl_dataset, "txg=%llu\n", tx->tx_txg);

		ASSERT(dmu_tx_is_syncing(tx));
		/* XXX the write_done callback should really give us the tx... */
		os->os_synctx = tx;

		if (os->os_dsl_dataset == NULL) {
			/*
			 * This is the MOS.  If we have upgraded,
			 * spa_max_replication() could change, so reset
			 * os_copies here.
			 */
			os->os_copies = spa_max_replication(os->os_spa);
		}

		/*
		 * Create the root block IO
		 */
		SET_BOOKMARK(&zb, os->os_dsl_dataset ?
		    os->os_dsl_dataset->ds_object : DMU_META_OBJSET,
		    ZB_ROOT_OBJECT, ZB_ROOT_LEVEL, ZB_ROOT_BLKID);
		arc_release(os->os_phys_buf, &os->os_phys_buf);

		dmu_write_policy(os, NULL, 0, 0, &zp);

		zio = arc_write(pio, os->os_spa, tx->tx_txg,
		    os->os_rootbp, os->os_phys_buf, DMU_OS_IS_L2CACHEABLE(os),
		    DMU_OS_IS_L2COMPRESSIBLE(os), &zp, dmu_objset_write_ready,
		    NULL, dmu_objset_write_done, os, ZIO_PRIORITY_ASYNC_WRITE,
		    ZIO_FLAG_MUSTSUCCEED, &zb);

		/*
		 * Sync special dnodes - the parent IO for the sync is the root
		 * block
		 */
		DMU_META_DNODE(os)->dn_zio = zio;
		dnode_sync(DMU_META_DNODE(os), tx);

		os->os_phys->os_flags = os->os_flags;

		if (DMU_USERUSED_DNODE(os) &&
		    DMU_USERUSED_DNODE(os)->dn_type != DMU_OT_NONE) {
			DMU_USERUSED_DNODE(os)->dn_zio = zio;
			dnode_sync(DMU_USERUSED_DNODE(os), tx);
			DMU_GROUPUSED_DNODE(os)->dn_zio = zio;
			dnode_sync(DMU_GROUPUSED_DNODE(os), tx);
		}

		txgoff = tx->tx_txg & TXG_MASK;

		if (dmu_objset_userused_enabled(os)) {
			newlist = &os->os_synced_dnodes;
			/*
			 * We must create the list here because it uses the
			 * dn_dirty_link[] of this txg.
			 */
			list_create(newlist, sizeof (dnode_t),
			    offsetof(dnode_t, dn_dirty_link[txgoff]));
		}

		dmu_objset_sync_dnodes(&os->os_free_dnodes[txgoff], newlist, tx);
		dmu_objset_sync_dnodes(&os->os_dirty_dnodes[txgoff], newlist, tx);

		list = &DMU_META_DNODE(os)->dn_dirty_records[txgoff];
		while ((dr = list_head(list)) != NULL) {
			ASSERT0(dr->dr_dbuf->db_level);
			list_remove(list, dr);
			if (dr->dr_zio)
				zio_nowait(dr->dr_zio);
		}
		/*
		 * Free intent log blocks up to this tx.
		 */
		zil_sync(os->os_zil, tx);
		os->os_phys->os_zil_header = os->os_zil_header;
		zio_nowait(zio);
	}

	/*
	 * Return whether the objset has any dirty or freed dnodes queued for
	 * the given txg.  Only the low bits of txg (txg & TXG_MASK) select the
	 * list slot, so a slot index may be passed as well.
	 */
	boolean_t
	dmu_objset_is_dirty(objset_t *os, uint64_t txg)
	{
		return (!list_is_empty(&os->os_dirty_dnodes[txg & TXG_MASK]) ||
		    !list_is_empty(&os->os_free_dnodes[txg & TXG_MASK]));
	}

	static objset_used_cb_t *used_cbs[DMU_OST_NUMTYPES];

	/*
	* Record cb as the used-space callback for objset type ost in the
	* used_cbs[] table.
	*
	* ost is used as an array index without a range check here, so it
	* must be less than DMU_OST_NUMTYPES.
	*/
	void
	dmu_objset_register_type(dmu_objset_type_t ost, objset_used_cb_t *cb)
	{
	used_cbs[ost] = cb;
	}

	/*
	* Return whether user/group space accounting applies to this objset.
	* All three conditions must hold: the pool version is at least
	* SPA_VERSION_USERSPACE, a callback has been registered for the
	* objset's type, and the objset has a userused dnode.
	*/
	boolean_t
	dmu_objset_userused_enabled(objset_t *os)
	{
	return (spa_version(os->os_spa) >= SPA_VERSION_USERSPACE &&
	used_cbs[os->os_phys->os_type] != NULL &&
	DMU_USERUSED_DNODE(os) != NULL);
	}

	/*
	* Apply one dnode's space usage to the user and group accounting
	* objects.
	*
	* os - objset whose accounting objects are updated
	* used - bytes used by the dnode's data; DNODE_SIZE is added to it
	* flags - dnode flags; nothing is done unless
	* DNODE_FLAG_USERUSED_ACCOUNTED is set
	* user - key to adjust in the DMU_USERUSED_OBJECT zap
	* group - key to adjust in the DMU_GROUPUSED_OBJECT zap
	* subtract - when true the amount is subtracted instead of added
	* tx - transaction for the zap updates
	*
	* The zap updates are VERIFYed to succeed.
	*/
	static void
	do_userquota_update(objset_t *os, uint64_t used, uint64_t flags,
	uint64_t user, uint64_t group, boolean_t subtract, dmu_tx_t *tx)
	{
	if ((flags & DNODE_FLAG_USERUSED_ACCOUNTED)) {
	int64_t delta = DNODE_SIZE + used;
	if (subtract)
	delta = -delta;
	VERIFY3U(0, ==, zap_increment_int(os, DMU_USERUSED_OBJECT,
	user, delta, tx));
	VERIFY3U(0, ==, zap_increment_int(os, DMU_GROUPUSED_OBJECT,
	group, delta, tx));
	}
	}

	/*
	 * Apply user/group space accounting for every dnode on
	 * os->os_synced_dnodes, emptying that list.
	 *
	 * For each dnode the old usage (if DN_ID_OLD_EXIST) is subtracted from
	 * the old uid/gid and the new usage (if DN_ID_NEW_EXIST) is added to the
	 * new uid/gid.  The dnode's "new" ids then become its "old" ids, and the
	 * reference tagged with the list (taken in dmu_objset_sync_dnodes()) is
	 * released.
	 */
	void
	dmu_objset_do_userquota_updates(objset_t *os, dmu_tx_t *tx)
	{
		dnode_t *dn;
		list_t *list = &os->os_synced_dnodes;

		ASSERT(list_head(list) == NULL || dmu_objset_userused_enabled(os));

		while ((dn = list_head(list)) != NULL) {
			int flags;
			ASSERT(!DMU_OBJECT_IS_SPECIAL(dn->dn_object));
			ASSERT(dn->dn_phys->dn_type == DMU_OT_NONE ||
			    dn->dn_phys->dn_flags &
			    DNODE_FLAG_USERUSED_ACCOUNTED);

			/* Allocate the user/groupused objects if necessary. */
			if (DMU_USERUSED_DNODE(os)->dn_type == DMU_OT_NONE) {
				VERIFY(0 == zap_create_claim(os,
				    DMU_USERUSED_OBJECT,
				    DMU_OT_USERGROUP_USED, DMU_OT_NONE, 0, tx));
				VERIFY(0 == zap_create_claim(os,
				    DMU_GROUPUSED_OBJECT,
				    DMU_OT_USERGROUP_USED, DMU_OT_NONE, 0, tx));
			}

			/*
			 * We intentionally modify the zap object even if the
			 * net delta is zero.  Otherwise
			 * the block of the zap obj could be shared between
			 * datasets but need to be different between them after
			 * a bprewrite.
			 */

			flags = dn->dn_id_flags;
			ASSERT(flags);
			if (flags & DN_ID_OLD_EXIST) {
				do_userquota_update(os, dn->dn_oldused, dn->dn_oldflags,
				    dn->dn_olduid, dn->dn_oldgid, B_TRUE, tx);
			}
			if (flags & DN_ID_NEW_EXIST) {
				do_userquota_update(os, DN_USED_BYTES(dn->dn_phys),
				    dn->dn_phys->dn_flags, dn->dn_newuid,
				    dn->dn_newgid, B_FALSE, tx);
			}

			mutex_enter(&dn->dn_mtx);
			dn->dn_oldused = 0;
			dn->dn_oldflags = 0;
			if (dn->dn_id_flags & DN_ID_NEW_EXIST) {
				dn->dn_olduid = dn->dn_newuid;
				dn->dn_oldgid = dn->dn_newgid;
				dn->dn_id_flags |= DN_ID_OLD_EXIST;
				if (dn->dn_bonuslen == 0)
					dn->dn_id_flags |= DN_ID_CHKED_SPILL;
				else
					dn->dn_id_flags |= DN_ID_CHKED_BONUS;
			}
			dn->dn_id_flags &= ~(DN_ID_NEW_EXIST);
			mutex_exit(&dn->dn_mtx);

			list_remove(list, dn);
			dnode_rele(dn, list);
		}
	}

	/*
	 * Returns a pointer to data to find uid/gid from
	 *
	 * If the dbuf has no dirty records, its current data is returned.
	 * If a dirty record for transaction group that is syncing can't
	 * be found then NULL is returned. In the NULL case it is assumed
	 * the uid/gid aren't changing.
	 */
	static void *
	dmu_objset_userquota_find_data(dmu_buf_impl_t *db, dmu_tx_t *tx)
	{
		dbuf_dirty_record_t *dr, **drp;
		void *data;

		if (db->db_dirtycnt == 0)
			return (db->db.db_data);  /* Nothing is changing */

		/* Walk the dirty-record chain looking for this tx's txg. */
		for (drp = &db->db_last_dirty; (dr = *drp) != NULL;
		    drp = &dr->dr_next) {
			if (dr->dr_txg == tx->tx_txg)
				break;
		}

		if (dr == NULL) {
			data = NULL;
		} else {
			dnode_t *dn;

			DB_DNODE_ENTER(dr->dr_dbuf);
			dn = DB_DNODE(dr->dr_dbuf);

			/*
			 * For a spill block on a dnode without a bonus buffer,
			 * dr_data is dereferenced to its b_data; otherwise
			 * dr_data itself is returned.
			 */
			if (dn->dn_bonuslen == 0 &&
			    dr->dr_dbuf->db_blkid == DMU_SPILL_BLKID)
				data = dr->dt.dl.dr_data->b_data;
			else
				data = dr->dt.dl.dr_data;

			DB_DNODE_EXIT(dr->dr_dbuf);
		}

		return (data);
	}

	/*
	 * Ask the objset type's registered callback for the uid/gid that own dn
	 * and store them in the dnode's "old" ids (before == B_TRUE) or "new"
	 * ids (before == B_FALSE). The data handed to the callback comes from
	 * the bonus buffer or, for DMU_OT_SA dnodes with no bonus, from the
	 * spill block. Does nothing when accounting is not enabled.
	 */
	void
	dmu_objset_userquota_get_ids(dnode_t *dn, boolean_t before, dmu_tx_t *tx)
	{
		objset_t *os = dn->dn_objset;
		void *data = NULL;
		dmu_buf_impl_t *db = NULL;
		uint64_t *user = NULL;
		uint64_t *group = NULL;
		int flags = dn->dn_id_flags;
		int error;
		boolean_t have_spill = B_FALSE;

		if (!dmu_objset_userused_enabled(dn->dn_objset))
			return;

		/* The "before" ids only need to be captured once. */
		if (before && (flags & (DN_ID_CHKED_BONUS|DN_ID_OLD_EXIST|
		    DN_ID_CHKED_SPILL)))
			return;

		if (before && dn->dn_bonuslen != 0)
			data = DN_BONUS(dn->dn_phys);
		else if (!before && dn->dn_bonuslen != 0) {
			if (dn->dn_bonus) {
				db = dn->dn_bonus;
				mutex_enter(&db->db_mtx);
				data = dmu_objset_userquota_find_data(db, tx);
			} else {
				data = DN_BONUS(dn->dn_phys);
			}
		} else if (dn->dn_bonuslen == 0 && dn->dn_bonustype == DMU_OT_SA) {
			int rf = 0;

			if (RW_WRITE_HELD(&dn->dn_struct_rwlock))
				rf |= DB_RF_HAVESTRUCT;
			error = dmu_spill_hold_by_dnode(dn,
			    rf | DB_RF_MUST_SUCCEED,
			    FTAG, (dmu_buf_t **)&db);
			ASSERT(error == 0);
			mutex_enter(&db->db_mtx);
			data = (before) ? db->db.db_data :
			    dmu_objset_userquota_find_data(db, tx);
			have_spill = B_TRUE;
		} else {
			/* No bonus and no SA spill: mark checked and stop. */
			mutex_enter(&dn->dn_mtx);
			dn->dn_id_flags |= DN_ID_CHKED_BONUS;
			mutex_exit(&dn->dn_mtx);
			return;
		}

		if (before) {
			ASSERT(data);
			user = &dn->dn_olduid;
			group = &dn->dn_oldgid;
		} else if (data) {
			user = &dn->dn_newuid;
			group = &dn->dn_newgid;
		}

		/*
		 * Must always call the callback in case the object
		 * type has changed and that type isn't an object type to track
		 */
		error = used_cbs[os->os_phys->os_type](dn->dn_bonustype, data,
		    user, group);

		/*
		 * Preserve existing uid/gid when the callback can't determine
		 * what the new uid/gid are and the callback returned EEXIST.
		 * The EEXIST error tells us to just use the existing uid/gid.
		 * If we don't know what the old values are then just assign
		 * them to 0, since that is a new file being created.
		 */
		if (!before && data == NULL && error == EEXIST) {
			if (flags & DN_ID_OLD_EXIST) {
				dn->dn_newuid = dn->dn_olduid;
				dn->dn_newgid = dn->dn_oldgid;
			} else {
				dn->dn_newuid = 0;
				dn->dn_newgid = 0;
			}
			error = 0;
		}

		if (db)
			mutex_exit(&db->db_mtx);

		mutex_enter(&dn->dn_mtx);
		if (error == 0 && before)
			dn->dn_id_flags |= DN_ID_OLD_EXIST;
		if (error == 0 && !before)
			dn->dn_id_flags |= DN_ID_NEW_EXIST;

		if (have_spill) {
			dn->dn_id_flags |= DN_ID_CHKED_SPILL;
		} else {
			dn->dn_id_flags |= DN_ID_CHKED_BONUS;
		}
		mutex_exit(&dn->dn_mtx);
		/* Only the spill path took a hold that must be released. */
		if (have_spill)
			dmu_buf_rele((dmu_buf_t *)db, FTAG);
	}

	/*
	 * Test the OBJSET_FLAG_USERACCOUNTING_COMPLETE bit in the on-disk objset
	 * flags. The masked value is returned as-is (not normalized to B_TRUE).
	 */
	boolean_t
	dmu_objset_userspace_present(objset_t *os)
	{
	return (os->os_phys->os_flags &
	OBJSET_FLAG_USERACCOUNTING_COMPLETE);
	}

	/*
	 * Bring an existing objset under user/group space accounting by dirtying
	 * the bonus buffer of every object, then mark accounting complete and
	 * wait for the changes to sync.
	 *
	 * Returns 0 on success (or if accounting is already complete), ENOTSUP
	 * if accounting is not enabled for this objset, EINVAL for a snapshot,
	 * and EINTR if a signal is pending. Per-object failures are skipped.
	 */
	int
	dmu_objset_userspace_upgrade(objset_t *os)
	{
		uint64_t obj;
		int err = 0;

		if (dmu_objset_userspace_present(os))
			return (0);
		if (!dmu_objset_userused_enabled(os))
			return (SET_ERROR(ENOTSUP));
		if (dmu_objset_is_snapshot(os))
			return (SET_ERROR(EINVAL));

		/*
		 * We simply need to mark every object dirty, so that it will be
		 * synced out and now accounted. If this is called
		 * concurrently, or if we already did some work before crashing,
		 * that's fine, since we track each object's accounted state
		 * independently.
		 */

		for (obj = 0; err == 0; err = dmu_object_next(os, &obj, FALSE, 0)) {
			dmu_tx_t *tx;
			dmu_buf_t *db;
			int objerr;

			if (issig(JUSTLOOKING) && issig(FORREAL))
				return (SET_ERROR(EINTR));

			objerr = dmu_bonus_hold(os, obj, FTAG, &db);
			if (objerr != 0)
				continue;
			tx = dmu_tx_create(os);
			dmu_tx_hold_bonus(tx, obj);
			objerr = dmu_tx_assign(tx, TXG_WAIT);
			if (objerr != 0) {
				/* drop the bonus hold taken above before skipping */
				dmu_buf_rele(db, FTAG);
				dmu_tx_abort(tx);
				continue;
			}
			dmu_buf_will_dirty(db, tx);
			dmu_buf_rele(db, FTAG);
			dmu_tx_commit(tx);
		}

		os->os_flags |= OBJSET_FLAG_USERACCOUNTING_COMPLETE;
		txg_wait_synced(dmu_objset_pool(os), 0);
		return (0);
	}

	/*
	 * Fetch space and object counts for the objset by forwarding all four
	 * output pointers to dsl_dataset_space() on its dataset.
	 */
	void
	dmu_objset_space(objset_t *os, uint64_t *refdbytesp, uint64_t *availbytesp,
	    uint64_t *usedobjsp, uint64_t *availobjsp)
	{
		dsl_dataset_space(os->os_dsl_dataset, refdbytesp, availbytesp,
		    usedobjsp, availobjsp);
	}

	/* Return dsl_dataset_fsid_guid() for the objset's dataset. */
	uint64_t
	dmu_objset_fsid_guid(objset_t *os)
	{
	return (dsl_dataset_fsid_guid(os->os_dsl_dataset));
	}

	/*
	 * Fill in stat->dds_type from the objset and, when the objset has a
	 * dataset, let dsl_dataset_fast_stat() fill in the rest.
	 */
	void
	dmu_objset_fast_stat(objset_t *os, dmu_objset_stats_t *stat)
	{
		stat->dds_type = os->os_phys->os_type;
		if (os->os_dsl_dataset)
			dsl_dataset_fast_stat(os->os_dsl_dataset, stat);
	}

	/*
	 * Add the objset's properties to nv: the dataset statistics (when a
	 * dataset is attached), the objset type, and whether user accounting
	 * is present. Only the meta objset may lack a dataset.
	 */
	void
	dmu_objset_stats(objset_t *os, nvlist_t *nv)
	{
		ASSERT(os->os_dsl_dataset ||
		    os->os_phys->os_type == DMU_OST_META);

		if (os->os_dsl_dataset != NULL)
			dsl_dataset_stats(os->os_dsl_dataset, nv);

		dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_TYPE,
		    os->os_phys->os_type);
		dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_USERACCOUNTING,
		    dmu_objset_userspace_present(os));
	}

	/*
	 * Return the dataset's ds_is_snapshot flag, or B_FALSE when the objset
	 * has no dataset.
	 */
	int
	dmu_objset_is_snapshot(objset_t *os)
	{
		dsl_dataset_t *dataset = os->os_dsl_dataset;

		if (dataset == NULL)
			return (B_FALSE);

		return (dataset->ds_is_snapshot);
	}

	/*
	 * Look up snapshot "name" in the dataset's snapshot-name zap using
	 * normalized matching (MT_FIRST), copying the stored name into real
	 * (at most maxlen bytes) and reporting a normalization conflict through
	 * conflict. Returns ENOENT when the dataset has no snapshot-name zap,
	 * otherwise the result of zap_lookup_norm().
	 */
	int
	dmu_snapshot_realname(objset_t *os, char *name, char *real, int maxlen,
	    boolean_t *conflict)
	{
		dsl_dataset_t *ds = os->os_dsl_dataset;
		uint64_t ignored;

		if (dsl_dataset_phys(ds)->ds_snapnames_zapobj == 0)
			return (SET_ERROR(ENOENT));

		return (zap_lookup_norm(ds->ds_dir->dd_pool->dp_meta_objset,
		    dsl_dataset_phys(ds)->ds_snapnames_zapobj, name, 8, 1, &ignored,
		    MT_FIRST, real, maxlen, conflict));
	}

	/*
	 * Return the snapshot at serialized zap cursor position *offp and advance
	 * *offp past it. The name is copied into name (namelen bytes available);
	 * idp and case_conflict are optional outputs. The caller must hold the
	 * pool config lock.
	 *
	 * Returns ENOENT when there is no snapshot zap or no further entry, and
	 * ENAMETOOLONG when the name does not fit in namelen.
	 */
	int
	dmu_snapshot_list_next(objset_t *os, int namelen, char *name,
	    uint64_t *idp, uint64_t *offp, boolean_t *case_conflict)
	{
		dsl_dataset_t *ds = os->os_dsl_dataset;
		zap_cursor_t cursor;
		zap_attribute_t attr;

		ASSERT(dsl_pool_config_held(dmu_objset_pool(os)));

		if (dsl_dataset_phys(ds)->ds_snapnames_zapobj == 0)
			return (SET_ERROR(ENOENT));

		zap_cursor_init_serialized(&cursor,
		    ds->ds_dir->dd_pool->dp_meta_objset,
		    dsl_dataset_phys(ds)->ds_snapnames_zapobj, *offp);

		if (zap_cursor_retrieve(&cursor, &attr) != 0) {
			zap_cursor_fini(&cursor);
			return (SET_ERROR(ENOENT));
		}

		/* The length check makes the strcpy() below safe. */
		if (strlen(attr.za_name) + 1 > namelen) {
			zap_cursor_fini(&cursor);
			return (SET_ERROR(ENAMETOOLONG));
		}

		(void) strcpy(name, attr.za_name);
		if (idp)
			*idp = attr.za_first_integer;
		if (case_conflict)
			*case_conflict = attr.za_normalization_conflict;
		zap_cursor_advance(&cursor);
		*offp = zap_cursor_serialize(&cursor);
		zap_cursor_fini(&cursor);

		return (0);
	}

	/*
	 * Return the child directory at serialized zap cursor position *offp and
	 * advance *offp past it. The name is copied into name (namelen bytes
	 * available); idp is an optional output.
	 *
	 * Returns ENOENT when os is not the head dataset of its directory or no
	 * further entry exists, and ENAMETOOLONG when the name does not fit.
	 */
	int
	dmu_dir_list_next(objset_t *os, int namelen, char *name,
	    uint64_t *idp, uint64_t *offp)
	{
		dsl_dir_t *dd = os->os_dsl_dataset->ds_dir;
		zap_cursor_t cursor;
		zap_attribute_t attr;

		/* there is no next dir on a snapshot! */
		if (os->os_dsl_dataset->ds_object !=
		    dsl_dir_phys(dd)->dd_head_dataset_obj)
			return (SET_ERROR(ENOENT));

		zap_cursor_init_serialized(&cursor,
		    dd->dd_pool->dp_meta_objset,
		    dsl_dir_phys(dd)->dd_child_dir_zapobj, *offp);

		if (zap_cursor_retrieve(&cursor, &attr) != 0) {
			zap_cursor_fini(&cursor);
			return (SET_ERROR(ENOENT));
		}

		/* The length check makes the strcpy() below safe. */
		if (strlen(attr.za_name) + 1 > namelen) {
			zap_cursor_fini(&cursor);
			return (SET_ERROR(ENAMETOOLONG));
		}

		(void) strcpy(name, attr.za_name);
		if (idp)
			*idp = attr.za_first_integer;
		zap_cursor_advance(&cursor);
		*offp = zap_cursor_serialize(&cursor);
		zap_cursor_fini(&cursor);

		return (0);
	}

	-/*
	- * Find objsets under and including ddobj, call func(ds) on each.
	- */
	-int
	-dmu_objset_find_dp(dsl_pool_t *dp, uint64_t ddobj,
	- int func(dsl_pool_t , dsl_dataset_t , void ), void arg, int flags)
	+/*
	+ * Work item describing one dsl_dir to visit during dmu_objset_find_dp().
	+ * One instance is allocated per directory and freed by
	+ * dmu_objset_find_dp_impl() once that directory has been processed.
	+ */
	+typedef struct dmu_objset_find_ctx {
	+ taskq_t *dc_tq; /* NULL: recurse synchronously */
	+ dsl_pool_t *dc_dp;
	+ uint64_t dc_ddobj; /* dsl_dir object to visit */
	+ int (*dc_func)(dsl_pool_t *, dsl_dataset_t *, void *);
	+ void *dc_arg; /* passed through to dc_func */
	+ int dc_flags; /* DS_FIND_* */
	+ kmutex_t *dc_error_lock; /* protects *dc_error */
	+ int *dc_error; /* first error seen, shared */
	+} dmu_objset_find_ctx_t;
	+
	+/*
	+ * Visit the dsl_dir named by dcp->dc_ddobj. Hidden ('$'-prefixed)
	+ * directories are skipped. With DS_FIND_CHILDREN each child directory gets
	+ * its own copy of the context, which is either dispatched to dcp->dc_tq or,
	+ * when no taskq is set, processed by direct recursion. With
	+ * DS_FIND_SNAPSHOTS dc_func is called on every snapshot, and finally
	+ * dc_func is called on the head dataset itself.
	+ *
	+ * The first error encountered is stored in *dcp->dc_error under
	+ * dc_error_lock; once an error is recorded, later invocations return
	+ * immediately. dcp is always freed before returning.
	+ */
	+static void
	+dmu_objset_find_dp_impl(dmu_objset_find_ctx_t *dcp)
	{
	+ dsl_pool_t *dp = dcp->dc_dp;
	+ dmu_objset_find_ctx_t *child_dcp;
	dsl_dir_t *dd;
	dsl_dataset_t *ds;
	zap_cursor_t zc;
	zap_attribute_t *attr;
	uint64_t thisobj;
	- int err;
	+ int err = 0;

	- ASSERT(dsl_pool_config_held(dp));
	+ /* don't process if there already was an error */
	+ if (*dcp->dc_error != 0)
	+ goto out;

	- err = dsl_dir_hold_obj(dp, ddobj, NULL, FTAG, &dd);
	+ err = dsl_dir_hold_obj(dp, dcp->dc_ddobj, NULL, FTAG, &dd);
	if (err != 0)
	- return (err);
	+ goto out;

	/* Don't visit hidden ($MOS & $ORIGIN) objsets. */
	if (dd->dd_myname[0] == '$') {
	dsl_dir_rele(dd, FTAG);
	- return (0);
	+ goto out;
	}

	thisobj = dsl_dir_phys(dd)->dd_head_dataset_obj;
	attr = kmem_alloc(sizeof (zap_attribute_t), KM_SLEEP);

	/*
	* Iterate over all children.
	*/
	- if (flags & DS_FIND_CHILDREN) {
	+ if (dcp->dc_flags & DS_FIND_CHILDREN) {
	for (zap_cursor_init(&zc, dp->dp_meta_objset,
	dsl_dir_phys(dd)->dd_child_dir_zapobj);
	zap_cursor_retrieve(&zc, attr) == 0;
	(void) zap_cursor_advance(&zc)) {
	ASSERT3U(attr->za_integer_length, ==,
	sizeof (uint64_t));
	ASSERT3U(attr->za_num_integers, ==, 1);

	- err = dmu_objset_find_dp(dp, attr->za_first_integer,
	- func, arg, flags);
	- if (err != 0)
	- break;
	+ child_dcp = kmem_alloc(sizeof (*child_dcp), KM_SLEEP);
	+ /* copy the context; only the directory object differs */
	+ *child_dcp = *dcp;
	+ child_dcp->dc_ddobj = attr->za_first_integer;
	+ if (dcp->dc_tq != NULL)
	+ (void) taskq_dispatch(dcp->dc_tq,
	+ dmu_objset_find_dp_cb, child_dcp, TQ_SLEEP);
	+ else
	+ dmu_objset_find_dp_impl(child_dcp);
	}
	zap_cursor_fini(&zc);
	-
	- if (err != 0) {
	- dsl_dir_rele(dd, FTAG);
	- kmem_free(attr, sizeof (zap_attribute_t));
	- return (err);
	- }
	}

	/*
	* Iterate over all snapshots.
	*/
	- if (flags & DS_FIND_SNAPSHOTS) {
	+ if (dcp->dc_flags & DS_FIND_SNAPSHOTS) {
	dsl_dataset_t *ds;
	err = dsl_dataset_hold_obj(dp, thisobj, FTAG, &ds);

	if (err == 0) {
	uint64_t snapobj;

	snapobj = dsl_dataset_phys(ds)->ds_snapnames_zapobj;
	dsl_dataset_rele(ds, FTAG);

	for (zap_cursor_init(&zc, dp->dp_meta_objset, snapobj);
	zap_cursor_retrieve(&zc, attr) == 0;
	(void) zap_cursor_advance(&zc)) {
	ASSERT3U(attr->za_integer_length, ==,
	sizeof (uint64_t));
	ASSERT3U(attr->za_num_integers, ==, 1);

	err = dsl_dataset_hold_obj(dp,
	attr->za_first_integer, FTAG, &ds);
	if (err != 0)
	break;
	- err = func(dp, ds, arg);
	+ err = dcp->dc_func(dp, ds, dcp->dc_arg);
	dsl_dataset_rele(ds, FTAG);
	if (err != 0)
	break;
	}
	zap_cursor_fini(&zc);
	}
	}

	dsl_dir_rele(dd, FTAG);
	kmem_free(attr, sizeof (zap_attribute_t));

	if (err != 0)
	- return (err);
	+ goto out;

	/*
	* Apply to self.
	*/
	err = dsl_dataset_hold_obj(dp, thisobj, FTAG, &ds);
	if (err != 0)
	- return (err);
	- err = func(dp, ds, arg);
	+ goto out;
	+ err = dcp->dc_func(dp, ds, dcp->dc_arg);
	dsl_dataset_rele(ds, FTAG);
	- return (err);
	+
	+out:
	+ if (err != 0) {
	+ mutex_enter(dcp->dc_error_lock);
	+ /* only keep first error */
	+ if (*dcp->dc_error == 0)
	+ *dcp->dc_error = err;
	+ mutex_exit(dcp->dc_error_lock);
	+ }
	+
	+ kmem_free(dcp, sizeof (*dcp));
	+}
	+
	+/*
	+ * Taskq entry point: run dmu_objset_find_dp_impl() on the context passed
	+ * in arg while holding the pool config lock. dp is saved up front because
	+ * dmu_objset_find_dp_impl() frees dcp before returning.
	+ */
	+static void
	+dmu_objset_find_dp_cb(void *arg)
	+{
	+ dmu_objset_find_ctx_t *dcp = arg;
	+ dsl_pool_t *dp = dcp->dc_dp;
	+
	+ dsl_pool_config_enter(dp, FTAG);
	+
	+ dmu_objset_find_dp_impl(dcp);
	+
	+ dsl_pool_config_exit(dp, FTAG);
	+}
	+
	+/*
	+ * Find objsets under and including ddobj, call func(ds) on each.
	+ * The order for the enumeration is completely undefined.
	+ * func is called with dsl_pool_config held.
	+ *
	+ * The walk is single threaded when DS_FIND_SERIALIZE is set or the caller
	+ * holds the pool config lock as writer; otherwise it runs on a temporary
	+ * taskq. Returns the first error recorded by any visit, or ENOMEM if the
	+ * taskq cannot be created.
	+ */
	+int
	+dmu_objset_find_dp(dsl_pool_t *dp, uint64_t ddobj,
	+ int func(dsl_pool_t *, dsl_dataset_t *, void *), void *arg, int flags)
	+{
	+ int error = 0;
	+ taskq_t *tq = NULL;
	+ int ntasks;
	+ dmu_objset_find_ctx_t *dcp;
	+ kmutex_t err_lock;
	+
	+ mutex_init(&err_lock, NULL, MUTEX_DEFAULT, NULL);
	+ dcp = kmem_alloc(sizeof (*dcp), KM_SLEEP);
	+ dcp->dc_tq = NULL;
	+ dcp->dc_dp = dp;
	+ dcp->dc_ddobj = ddobj;
	+ dcp->dc_func = func;
	+ dcp->dc_arg = arg;
	+ dcp->dc_flags = flags;
	+ dcp->dc_error_lock = &err_lock;
	+ dcp->dc_error = &error;
	+
	+ if ((flags & DS_FIND_SERIALIZE) || dsl_pool_config_held_writer(dp)) {
	+ /*
	+ * In case a write lock is held we can't make use of
	+ * parallelism, as down the stack of the worker threads
	+ * the lock is asserted via dsl_pool_config_held.
	+ * In case of a read lock this is solved by getting a read
	+ * lock in each worker thread, which isn't possible in case
	+ * of a writer lock. So we fall back to the synchronous path
	+ * here.
	+ * In the future it might be possible to get some magic into
	+ * dsl_pool_config_held in a way that it returns true for
	+ * the worker threads so that a single lock held from this
	+ * thread suffices. For now, stay single threaded.
	+ */
	+ /* dcp is freed by dmu_objset_find_dp_impl() */
	+ dmu_objset_find_dp_impl(dcp);
	+ mutex_destroy(&err_lock);
	+
	+ return (error);
	+ }
	+
	+ ntasks = dmu_find_threads;
	+ if (ntasks == 0)
	+ ntasks = vdev_count_leaves(dp->dp_spa) * 4;
	+ tq = taskq_create("dmu_objset_find", ntasks, minclsyspri, ntasks,
	+ INT_MAX, 0);
	+ if (tq == NULL) {
	+ kmem_free(dcp, sizeof (*dcp));
	+ mutex_destroy(&err_lock);
	+ return (SET_ERROR(ENOMEM));
	+ }
	+ dcp->dc_tq = tq;
	+
	+ /* dcp will be freed by task */
	+ (void) taskq_dispatch(tq, dmu_objset_find_dp_cb, dcp, TQ_SLEEP);
	+
	+ /*
	+ * PORTING: this code relies on the property of taskq_wait to wait
	+ * until no more tasks are queued and no more tasks are active. As
	+ * we always queue new tasks from within other tasks, task_wait
	+ * reliably waits for the full recursion to finish, even though we
	+ * enqueue new tasks after taskq_wait has been called.
	+ * On platforms other than illumos, taskq_wait may not have this
	+ * property.
	+ */
	+ taskq_wait(tq);
	+ taskq_destroy(tq);
	+ mutex_destroy(&err_lock);
	+
	+ return (error);
	}

	/*
	 * Find all objsets under name, and for each, call 'func(child_name, arg)'.
	 * The dp_config_rwlock must not be held when this is called, and it
	 * will not be held when the callback is called.
	 * Therefore this function should only be used when the pool is not changing
	 * (e.g. in syncing context), or the callback can deal with the possible races.
	 *
	 * Child directories are visited first (DS_FIND_CHILDREN), then snapshots
	 * (DS_FIND_SNAPSHOTS), then the named objset itself. The walk stops at
	 * the first non-zero return and that value is returned.
	 */
	static int
	dmu_objset_find_impl(spa_t *spa, const char *name,
	    int func(const char *, void *), void *arg, int flags)
	{
		dsl_dir_t *dd;
		dsl_pool_t *dp = spa_get_dsl(spa);
		dsl_dataset_t *ds;
		zap_cursor_t zc;
		zap_attribute_t *attr;
		char *child;
		uint64_t thisobj;
		int err;

		dsl_pool_config_enter(dp, FTAG);

		err = dsl_dir_hold(dp, name, FTAG, &dd, NULL);
		if (err != 0) {
			dsl_pool_config_exit(dp, FTAG);
			return (err);
		}

		/* Don't visit hidden ($MOS & $ORIGIN) objsets. */
		if (dd->dd_myname[0] == '$') {
			dsl_dir_rele(dd, FTAG);
			dsl_pool_config_exit(dp, FTAG);
			return (0);
		}

		thisobj = dsl_dir_phys(dd)->dd_head_dataset_obj;
		attr = kmem_alloc(sizeof (zap_attribute_t), KM_SLEEP);

		/*
		 * Iterate over all children.
		 */
		if (flags & DS_FIND_CHILDREN) {
			for (zap_cursor_init(&zc, dp->dp_meta_objset,
			    dsl_dir_phys(dd)->dd_child_dir_zapobj);
			    zap_cursor_retrieve(&zc, attr) == 0;
			    (void) zap_cursor_advance(&zc)) {
				ASSERT3U(attr->za_integer_length, ==,
				    sizeof (uint64_t));
				ASSERT3U(attr->za_num_integers, ==, 1);

				child = kmem_asprintf("%s/%s", name, attr->za_name);
				/* the recursive call takes the config lock itself */
				dsl_pool_config_exit(dp, FTAG);
				err = dmu_objset_find_impl(spa, child,
				    func, arg, flags);
				dsl_pool_config_enter(dp, FTAG);
				strfree(child);
				if (err != 0)
					break;
			}
			zap_cursor_fini(&zc);

			if (err != 0) {
				dsl_dir_rele(dd, FTAG);
				dsl_pool_config_exit(dp, FTAG);
				kmem_free(attr, sizeof (zap_attribute_t));
				return (err);
			}
		}

		/*
		 * Iterate over all snapshots.
		 */
		if (flags & DS_FIND_SNAPSHOTS) {
			err = dsl_dataset_hold_obj(dp, thisobj, FTAG, &ds);

			if (err == 0) {
				uint64_t snapobj;

				snapobj = dsl_dataset_phys(ds)->ds_snapnames_zapobj;
				dsl_dataset_rele(ds, FTAG);

				for (zap_cursor_init(&zc, dp->dp_meta_objset, snapobj);
				    zap_cursor_retrieve(&zc, attr) == 0;
				    (void) zap_cursor_advance(&zc)) {
					ASSERT3U(attr->za_integer_length, ==,
					    sizeof (uint64_t));
					ASSERT3U(attr->za_num_integers, ==, 1);

					child = kmem_asprintf("%s@%s",
					    name, attr->za_name);
					/* callback runs without the config lock */
					dsl_pool_config_exit(dp, FTAG);
					err = func(child, arg);
					dsl_pool_config_enter(dp, FTAG);
					strfree(child);
					if (err != 0)
						break;
				}
				zap_cursor_fini(&zc);
			}
		}

		dsl_dir_rele(dd, FTAG);
		kmem_free(attr, sizeof (zap_attribute_t));
		dsl_pool_config_exit(dp, FTAG);

		if (err != 0)
			return (err);

		/* Apply to self. */
		return (func(name, arg));
	}

	/*
	 * See comment above dmu_objset_find_impl().
	 *
	 * Opens the pool containing name, runs dmu_objset_find_impl() on it, and
	 * closes the pool again. Returns the spa_open() error if the pool cannot
	 * be opened, otherwise the result of the walk.
	 */
	int
	dmu_objset_find(char *name, int func(const char *, void *), void *arg,
	    int flags)
	{
		spa_t *spa;
		int error;

		error = spa_open(name, &spa, FTAG);
		if (error != 0)
			return (error);
		error = dmu_objset_find_impl(spa, name, func, arg, flags);
		spa_close(spa, FTAG);
		return (error);
	}

	/*
	 * Store the consumer's private pointer on the objset. The caller must
	 * hold os->os_user_ptr_lock.
	 */
	void
	dmu_objset_set_user(objset_t *os, void *user_ptr)
	{
		ASSERT(MUTEX_HELD(&os->os_user_ptr_lock));
		os->os_user_ptr = user_ptr;
	}

	/*
	 * Return the pointer stored in os->os_user_ptr. The caller must hold
	 * os->os_user_ptr_lock.
	 */
	void *
	dmu_objset_get_user(objset_t *os)
	{
	ASSERT(MUTEX_HELD(&os->os_user_ptr_lock));
	return (os->os_user_ptr);
	}

	/*
	 * Determine name of filesystem, given name of snapshot.
	 * buf must be at least MAXNAMELEN bytes
	 *
	 * Copies everything before the first '@' into buf. Returns EINVAL if
	 * snapname contains no '@' and ENAMETOOLONG if the filesystem part is
	 * MAXNAMELEN characters or longer.
	 */
	int
	dmu_fsname(const char *snapname, char *buf)
	{
		const char *atp = strchr(snapname, '@');

		if (atp == NULL)
			return (SET_ERROR(EINVAL));
		if (atp - snapname >= MAXNAMELEN)
			return (SET_ERROR(ENAMETOOLONG));
		/* size includes room for the terminating NUL */
		(void) strlcpy(buf, snapname, atp - snapname + 1);
		return (0);
	}
	Index: vendor-sys/illumos/dist/uts/common/fs/zfs/dsl_pool.c
	===================================================================
	--- vendor-sys/illumos/dist/uts/common/fs/zfs/dsl_pool.c (revision 284761)
	+++ vendor-sys/illumos/dist/uts/common/fs/zfs/dsl_pool.c (revision 284762)
	@@ -1,1058 +1,1064 @@
	/*
	* CDDL HEADER START
	*
	* The contents of this file are subject to the terms of the
	* Common Development and Distribution License (the "License").
	* You may not use this file except in compliance with the License.
	*
	* You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
	* or http://www.opensolaris.org/os/licensing.
	* See the License for the specific language governing permissions
	* and limitations under the License.
	*
	* When distributing Covered Code, include this CDDL HEADER in each
	* file and include the License file at usr/src/OPENSOLARIS.LICENSE.
	* If applicable, add the following below this CDDL HEADER, with the
	* fields enclosed by brackets "[]" replaced with your own identifying
	* information: Portions Copyright [yyyy] [name of copyright owner]
	*
	* CDDL HEADER END
	*/
	/*
	* Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
	* Copyright (c) 2011, 2014 by Delphix. All rights reserved.
	* Copyright (c) 2013 Steven Hartland. All rights reserved.
	* Copyright (c) 2014 Spectra Logic Corporation, All rights reserved.
	*/

	#include <sys/dsl_pool.h>
	#include <sys/dsl_dataset.h>
	#include <sys/dsl_prop.h>
	#include <sys/dsl_dir.h>
	#include <sys/dsl_synctask.h>
	#include <sys/dsl_scan.h>
	#include <sys/dnode.h>
	#include <sys/dmu_tx.h>
	#include <sys/dmu_objset.h>
	#include <sys/arc.h>
	#include <sys/zap.h>
	#include <sys/zio.h>
	#include <sys/zfs_context.h>
	#include <sys/fs/zfs.h>
	#include <sys/zfs_znode.h>
	#include <sys/spa_impl.h>
	#include <sys/dsl_deadlist.h>
	#include <sys/bptree.h>
	#include <sys/zfeature.h>
	#include <sys/zil_impl.h>
	#include <sys/dsl_userhold.h>

	/*
	* ZFS Write Throttle
	* ------------------
	*
	* ZFS must limit the rate of incoming writes to the rate at which it is able
	* to sync data modifications to the backend storage. Throttling by too much
	* creates an artificial limit; throttling by too little can only be sustained
	* for short periods and would lead to highly lumpy performance. On a per-pool
	* basis, ZFS tracks the amount of modified (dirty) data. As operations change
	* data, the amount of dirty data increases; as ZFS syncs out data, the amount
	* of dirty data decreases. When the amount of dirty data exceeds a
	* predetermined threshold further modifications are blocked until the amount
	* of dirty data decreases (as data is synced out).
	*
	* The limit on dirty data is tunable, and should be adjusted according to
	* both the IO capacity and available memory of the system. The larger the
	* window, the more ZFS is able to aggregate and amortize metadata (and data)
	* changes. However, memory is a limited resource, and allowing for more dirty
	* data comes at the cost of keeping other useful data in memory (for example
	* ZFS data cached by the ARC).
	*
	* Implementation
	*
	* As buffers are modified dsl_pool_dirty_space() increments both the per-
	* txg (dp_dirty_pertxg[]) and poolwide (dp_dirty_total) accounting of
	* dirty space used; dsl_pool_undirty_space() decrements those values as data
	* is synced out from dsl_pool_sync(). While only the poolwide value is
	* relevant, the per-txg value is useful for debugging. The tunable
	* zfs_dirty_data_max determines the dirty space limit. Once that value is
	* exceeded, new writes are halted until space frees up.
	*
	* The zfs_dirty_data_sync tunable dictates the threshold at which we
	* ensure that there is a txg syncing (see the comment in txg.c for a full
	* description of transaction group stages).
	*
	* The IO scheduler uses both the dirty space limit and current amount of
	* dirty data as inputs. Those values affect the number of concurrent IOs ZFS
	* issues. See the comment in vdev_queue.c for details of the IO scheduler.
	*
	* The delay is also calculated based on the amount of dirty data. See the
	* comment above dmu_tx_delay() for details.
	*/

	/*
	* zfs_dirty_data_max will be set to zfs_dirty_data_max_percent% of all memory,
	* capped at zfs_dirty_data_max_max. It can also be overridden in /etc/system.
	*/
	uint64_t zfs_dirty_data_max;
	uint64_t zfs_dirty_data_max_max = 4ULL * 1024 * 1024 * 1024;
	int zfs_dirty_data_max_percent = 10;

	/*
	* If there is at least this much dirty data, push out a txg.
	*/
	uint64_t zfs_dirty_data_sync = 64 * 1024 * 1024;

	/*
	* Once there is this amount of dirty data, the dmu_tx_delay() will kick in
	* and delay each transaction.
	* This value should be >= zfs_vdev_async_write_active_max_dirty_percent.
	*/
	int zfs_delay_min_dirty_percent = 60;

	/*
	* This controls how quickly the delay approaches infinity.
	* Larger values cause it to delay more for a given amount of dirty data.
	* Therefore larger values will cause there to be less dirty data for a
	* given throughput.
	*
	* For the smoothest delay, this value should be about 1 billion divided
	* by the maximum number of operations per second. This will smoothly
	* handle between 10x and 1/10th this number.
	*
	* Note: zfs_delay_scale * zfs_dirty_data_max must be < 2^64, due to the
	* multiply in dmu_tx_delay().
	*/
	uint64_t zfs_delay_scale = 1000 * 1000 * 1000 / 2000;


	/*
	* Both default to 10 milliseconds, expressed in nanoseconds.
	* NOTE(review): neither is referenced in the visible part of this file;
	* confirm whether they are still used anywhere.
	*/
	hrtime_t zfs_throttle_delay = MSEC2NSEC(10);
	hrtime_t zfs_throttle_resolution = MSEC2NSEC(10);

	/*
	 * Look up "name" in the root directory's child-dir zap and hold the
	 * resulting dsl_dir in *ddp. The hold is taken with dp as the tag, so
	 * callers must release it with dsl_dir_rele(dd, dp). Returns the
	 * zap_lookup() error if the entry does not exist.
	 */
	int
	dsl_pool_open_special_dir(dsl_pool_t *dp, const char *name, dsl_dir_t **ddp)
	{
		uint64_t obj;
		int err;

		err = zap_lookup(dp->dp_meta_objset,
		    dsl_dir_phys(dp->dp_root_dir)->dd_child_dir_zapobj,
		    name, sizeof (obj), 1, &obj);
		if (err)
			return (err);

		return (dsl_dir_hold_obj(dp, obj, name, dp, ddp));
	}

	/*
	 * Allocate a zeroed dsl_pool_t for spa and set up its in-core state:
	 * root block pointer copy, config rrwlock, txg state, the per-txg dirty
	 * lists, dp_lock and its condvar, and the vnode-release taskq.
	 */
	static dsl_pool_t *
	dsl_pool_open_impl(spa_t *spa, uint64_t txg)
	{
		dsl_pool_t *pool = kmem_zalloc(sizeof (*pool), KM_SLEEP);

		pool->dp_spa = spa;
		pool->dp_meta_rootbp = *spa_get_rootblkptr(spa);
		rrw_init(&pool->dp_config_rwlock, B_TRUE);
		txg_init(pool, txg);

		/* One txg list per kind of dirty object. */
		txg_list_create(&pool->dp_dirty_datasets,
		    offsetof(dsl_dataset_t, ds_dirty_link));
		txg_list_create(&pool->dp_dirty_zilogs,
		    offsetof(zilog_t, zl_dirty_link));
		txg_list_create(&pool->dp_dirty_dirs,
		    offsetof(dsl_dir_t, dd_dirty_link));
		txg_list_create(&pool->dp_sync_tasks,
		    offsetof(dsl_sync_task_t, dst_node));

		mutex_init(&pool->dp_lock, NULL, MUTEX_DEFAULT, NULL);
		cv_init(&pool->dp_spaceavail_cv, NULL, CV_DEFAULT, NULL);

		pool->dp_vnrele_taskq = taskq_create("zfs_vn_rele_taskq", 1,
		    minclsyspri, 1, 4, 0);

		return (pool);
	}

	/*
	 * Create the in-core pool structure and open its meta objset from the
	 * root block pointer. On success the new pool is returned through *dpp;
	 * on failure the partially built pool is torn down with dsl_pool_close()
	 * and *dpp is left untouched.
	 */
	int
	dsl_pool_init(spa_t *spa, uint64_t txg, dsl_pool_t **dpp)
	{
		int err;
		dsl_pool_t *dp = dsl_pool_open_impl(spa, txg);

		err = dmu_objset_open_impl(spa, NULL, &dp->dp_meta_rootbp,
		    &dp->dp_meta_objset);
		if (err != 0)
			dsl_pool_close(dp);
		else
			*dpp = dp;

		return (err);
	}

	/*
	 * Populate dp from the pool directory in the meta objset: root dir,
	 * $MOS dir, origin snapshot, free dir and free bpobj, leak dir, bptree
	 * object, empty bpobj, temporary user-refs object, and scan state.
	 * Version- and feature-dependent items are only looked up when the pool
	 * supports them. The config lock is held as writer for the duration.
	 *
	 * Returns 0 on success or the first lookup/hold error. Holds already
	 * taken are not released here on failure.
	 */
	int
	dsl_pool_open(dsl_pool_t *dp)
	{
	int err;
	dsl_dir_t *dd;
	dsl_dataset_t *ds;
	uint64_t obj;

	rrw_enter(&dp->dp_config_rwlock, RW_WRITER, FTAG);
	err = zap_lookup(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT,
	DMU_POOL_ROOT_DATASET, sizeof (uint64_t), 1,
	&dp->dp_root_dir_obj);
	if (err)
	goto out;

	err = dsl_dir_hold_obj(dp, dp->dp_root_dir_obj,
	NULL, dp, &dp->dp_root_dir);
	if (err)
	goto out;

	err = dsl_pool_open_special_dir(dp, MOS_DIR_NAME, &dp->dp_mos_dir);
	if (err)
	goto out;

	if (spa_version(dp->dp_spa) >= SPA_VERSION_ORIGIN) {
	err = dsl_pool_open_special_dir(dp, ORIGIN_DIR_NAME, &dd);
	if (err)
	goto out;
	err = dsl_dataset_hold_obj(dp,
	dsl_dir_phys(dd)->dd_head_dataset_obj, FTAG, &ds);
	if (err == 0) {
	/* the origin snapshot is the head dataset's previous snapshot */
	err = dsl_dataset_hold_obj(dp,
	dsl_dataset_phys(ds)->ds_prev_snap_obj, dp,
	&dp->dp_origin_snap);
	dsl_dataset_rele(ds, FTAG);
	}
	/* dsl_pool_open_special_dir() took this hold with tag dp */
	dsl_dir_rele(dd, dp);
	if (err)
	goto out;
	}

	if (spa_version(dp->dp_spa) >= SPA_VERSION_DEADLISTS) {
	err = dsl_pool_open_special_dir(dp, FREE_DIR_NAME,
	&dp->dp_free_dir);
	if (err)
	goto out;

	err = zap_lookup(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT,
	DMU_POOL_FREE_BPOBJ, sizeof (uint64_t), 1, &obj);
	if (err)
	goto out;
	VERIFY0(bpobj_open(&dp->dp_free_bpobj,
	dp->dp_meta_objset, obj));
	}

	/*
	* Note: errors ignored, because the leak dir will not exist if we
	* have not encountered a leak yet.
	*/
	(void) dsl_pool_open_special_dir(dp, LEAK_DIR_NAME,
	&dp->dp_leak_dir);

	if (spa_feature_is_active(dp->dp_spa, SPA_FEATURE_ASYNC_DESTROY)) {
	err = zap_lookup(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT,
	DMU_POOL_BPTREE_OBJ, sizeof (uint64_t), 1,
	&dp->dp_bptree_obj);
	if (err != 0)
	goto out;
	}

	if (spa_feature_is_active(dp->dp_spa, SPA_FEATURE_EMPTY_BPOBJ)) {
	err = zap_lookup(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT,
	DMU_POOL_EMPTY_BPOBJ, sizeof (uint64_t), 1,
	&dp->dp_empty_bpobj);
	if (err != 0)
	goto out;
	}

	/* a missing temporary user-refs object is not an error */
	err = zap_lookup(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT,
	DMU_POOL_TMP_USERREFS, sizeof (uint64_t), 1,
	&dp->dp_tmp_userrefs_obj);
	if (err == ENOENT)
	err = 0;
	if (err)
	goto out;

	err = dsl_scan_init(dp, dp->dp_tx.tx_open_txg);

	out:
	rrw_exit(&dp->dp_config_rwlock, FTAG);
	return (err);
	}

	/*
	 * Tear down a pool structure: release the directory and dataset holds
	 * that are set, close the free bpobj, evict the meta objset, destroy
	 * the txg lists, flush this spa's ARC buffers, shut down txg and scan
	 * state, and free dp. Every hold is checked for NULL first, so this is
	 * also used on a partially opened pool.
	 */
	void
	dsl_pool_close(dsl_pool_t *dp)
	{
	/*
	* Drop our references from dsl_pool_open().
	*
	* Since we held the origin_snap from "syncing" context (which
	* includes pool-opening context), it actually only got a "ref"
	* and not a hold, so just drop that here.
	*/
	if (dp->dp_origin_snap)
	dsl_dataset_rele(dp->dp_origin_snap, dp);
	if (dp->dp_mos_dir)
	dsl_dir_rele(dp->dp_mos_dir, dp);
	if (dp->dp_free_dir)
	dsl_dir_rele(dp->dp_free_dir, dp);
	if (dp->dp_leak_dir)
	dsl_dir_rele(dp->dp_leak_dir, dp);
	if (dp->dp_root_dir)
	dsl_dir_rele(dp->dp_root_dir, dp);

	bpobj_close(&dp->dp_free_bpobj);

	/*
	* undo the dmu_objset_open_impl(mos) from dsl_pool_init(), or the
	* dmu_objset_create_impl(mos) from dsl_pool_create()
	*/
	if (dp->dp_meta_objset)
	dmu_objset_evict(dp->dp_meta_objset);

	txg_list_destroy(&dp->dp_dirty_datasets);
	txg_list_destroy(&dp->dp_dirty_zilogs);
	txg_list_destroy(&dp->dp_sync_tasks);
	txg_list_destroy(&dp->dp_dirty_dirs);

	/*
	* We can't set retry to TRUE since we're explicitly specifying
	* a spa to flush. This is good enough; any missed buffers for
	* this spa won't cause trouble, and they'll eventually fall
	* out of the ARC just like any other unused buffer.
	*/
	arc_flush(dp->dp_spa, FALSE);

	txg_fini(dp);
	dsl_scan_fini(dp);
	dmu_buf_user_evict_wait();

	rrw_destroy(&dp->dp_config_rwlock);
	mutex_destroy(&dp->dp_lock);
	taskq_destroy(dp->dp_vnrele_taskq);
	if (dp->dp_blkstats)
	kmem_free(dp->dp_blkstats, sizeof (zfs_all_blkstats_t));
	kmem_free(dp, sizeof (dsl_pool_t));
	}

	/*
	 * Build a brand-new pool in transaction group txg: create the meta
	 * objset and pool directory, the root, $MOS and (version permitting)
	 * free directories, the free bpobj, the origin, and the root dataset
	 * with its objset. In the kernel the root filesystem is populated via
	 * zfs_create_fs() using zplprops. Returns the new pool.
	 */
	dsl_pool_t *
	dsl_pool_create(spa_t *spa, nvlist_t *zplprops, uint64_t txg)
	{
		int err;
		dsl_pool_t *dp = dsl_pool_open_impl(spa, txg);
		dmu_tx_t *tx = dmu_tx_create_assigned(dp, txg);
		objset_t *os;
		dsl_dataset_t *ds;
		uint64_t obj;

		rrw_enter(&dp->dp_config_rwlock, RW_WRITER, FTAG);

		/* create and open the MOS (meta-objset) */
		dp->dp_meta_objset = dmu_objset_create_impl(spa,
		    NULL, &dp->dp_meta_rootbp, DMU_OST_META, tx);

		/* create the pool directory */
		err = zap_create_claim(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT,
		    DMU_OT_OBJECT_DIRECTORY, DMU_OT_NONE, 0, tx);
		ASSERT0(err);

		/* Initialize scan structures */
		VERIFY0(dsl_scan_init(dp, txg));

		/* create and open the root dir */
		dp->dp_root_dir_obj = dsl_dir_create_sync(dp, NULL, NULL, tx);
		VERIFY0(dsl_dir_hold_obj(dp, dp->dp_root_dir_obj,
		    NULL, dp, &dp->dp_root_dir));

		/* create and open the meta-objset dir */
		(void) dsl_dir_create_sync(dp, dp->dp_root_dir, MOS_DIR_NAME, tx);
		VERIFY0(dsl_pool_open_special_dir(dp,
		    MOS_DIR_NAME, &dp->dp_mos_dir));

		if (spa_version(spa) >= SPA_VERSION_DEADLISTS) {
			/* create and open the free dir */
			(void) dsl_dir_create_sync(dp, dp->dp_root_dir,
			    FREE_DIR_NAME, tx);
			VERIFY0(dsl_pool_open_special_dir(dp,
			    FREE_DIR_NAME, &dp->dp_free_dir));

			/* create and open the free_bplist */
			obj = bpobj_alloc(dp->dp_meta_objset, SPA_OLD_MAXBLOCKSIZE, tx);
			VERIFY(zap_add(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT,
			    DMU_POOL_FREE_BPOBJ, sizeof (uint64_t), 1, &obj, tx) == 0);
			VERIFY0(bpobj_open(&dp->dp_free_bpobj,
			    dp->dp_meta_objset, obj));
		}

		if (spa_version(spa) >= SPA_VERSION_DSL_SCRUB)
			dsl_pool_create_origin(dp, tx);

		/* create the root dataset */
		obj = dsl_dataset_create_sync_dd(dp->dp_root_dir, NULL, 0, tx);

		/* create the root objset */
		VERIFY0(dsl_dataset_hold_obj(dp, obj, FTAG, &ds));
		os = dmu_objset_create_impl(dp->dp_spa, ds,
		    dsl_dataset_get_blkptr(ds), DMU_OST_ZFS, tx);
	#ifdef _KERNEL
		zfs_create_fs(os, kcred, zplprops, tx);
	#endif
		dsl_dataset_rele(ds, FTAG);

		dmu_tx_commit(tx);

		rrw_exit(&dp->dp_config_rwlock, FTAG);

		return (dp);
	}

	/*
	* Account for the meta-objset space in its placeholder dsl_dir.
	*
	* The three deltas are accumulated into dp_mos_used_delta,
	* dp_mos_compressed_delta and dp_mos_uncompressed_delta under dp_lock;
	* nothing is written to the dsl_dir here. comp and uncomp are asserted
	* to be equal.
	*/
	void
	dsl_pool_mos_diduse_space(dsl_pool_t *dp,
	int64_t used, int64_t comp, int64_t uncomp)
	{
	ASSERT3U(comp, ==, uncomp); /* it's all metadata */
	mutex_enter(&dp->dp_lock);
	dp->dp_mos_used_delta += used;
	dp->dp_mos_compressed_delta += comp;
	dp->dp_mos_uncompressed_delta += uncomp;
	mutex_exit(&dp->dp_lock);
	}

	/*
	 * Callback that inserts bp into the dsl_deadlist_t passed as arg.
	 * Always returns 0.
	 */
	static int
	deadlist_enqueue_cb(void *arg, const blkptr_t *bp, dmu_tx_t *tx)
	{
		dsl_deadlist_t *dl = arg;
		dsl_deadlist_insert(dl, bp, tx);
		return (0);
	}

	/*
	 * Sync the meta-objset under a fresh root zio, wait for the I/O to
	 * finish, then hand the resulting dp_meta_rootbp to the SPA as the new
	 * root block pointer.
	 */
	static void
	dsl_pool_sync_mos(dsl_pool_t *dp, dmu_tx_t *tx)
	{
		zio_t *zio = zio_root(dp->dp_spa, NULL, NULL, ZIO_FLAG_MUSTSUCCEED);

		dmu_objset_sync(dp->dp_meta_objset, zio, tx);
		VERIFY0(zio_wait(zio));
		dprintf_bp(&dp->dp_meta_rootbp, "meta objset rootbp is %s", "");
		spa_set_rootblkptr(dp->dp_spa, &dp->dp_meta_rootbp);
	}

	/*
	 * Adjust dp_dirty_total by 'delta' (which may be negative) and wake one
	 * waiter on dp_spaceavail_cv if the total is within zfs_dirty_data_max.
	 * The caller must hold dp_lock.
	 */
	static void
	dsl_pool_dirty_delta(dsl_pool_t *dp, int64_t delta)
	{
		ASSERT(MUTEX_HELD(&dp->dp_lock));

		if (delta < 0) {
			ASSERT3U(-delta, <=, dp->dp_dirty_total);
		}

		dp->dp_dirty_total += delta;

		if (dp->dp_dirty_total > zfs_dirty_data_max)
			return;

		/*
		 * Note: we signal even when increasing dp_dirty_total.
		 * This ensures forward progress -- each thread wakes the next
		 * waiter.
		 */
		cv_signal(&dp->dp_spaceavail_cv);
	}

	/*
	 * Sync out the pool's dirty state for 'txg'. In order: write the dirty
	 * datasets, update user/group accounting, re-sync the datasets dirtied
	 * by that accounting, move pending deadlists on-disk, sync dirty
	 * dsl_dirs, apply the accumulated MOS space deltas, sync the MOS if it
	 * has dirty or freed dnodes, and finally run the queued sync tasks.
	 */
	void
	dsl_pool_sync(dsl_pool_t *dp, uint64_t txg)
	{
		zio_t *zio;
		dmu_tx_t *tx;
		dsl_dir_t *dd;
		dsl_dataset_t *ds;
		objset_t *mos = dp->dp_meta_objset;
		list_t synced_datasets;

		list_create(&synced_datasets, sizeof (dsl_dataset_t),
		    offsetof(dsl_dataset_t, ds_synced_link));

		tx = dmu_tx_create_assigned(dp, txg);

		/*
		 * Write out all dirty blocks of dirty datasets.
		 */
		zio = zio_root(dp->dp_spa, NULL, NULL, ZIO_FLAG_MUSTSUCCEED);
		while ((ds = txg_list_remove(&dp->dp_dirty_datasets, txg)) != NULL) {
			/*
			 * We must not sync any non-MOS datasets twice, because
			 * we may have taken a snapshot of them. However, we
			 * may sync newly-created datasets on pass 2.
			 */
			ASSERT(!list_link_active(&ds->ds_synced_link));
			list_insert_tail(&synced_datasets, ds);
			dsl_dataset_sync(ds, zio, tx);
		}
		VERIFY0(zio_wait(zio));

		/*
		 * We have written all of the accounted dirty data, so our
		 * dp_space_towrite should now be zero. However, some seldom-used
		 * code paths do not adhere to this (e.g. dbuf_undirty(), also
		 * rounding error in dbuf_write_physdone).
		 * Shore up the accounting of any dirtied space now.
		 */
		dsl_pool_undirty_space(dp, dp->dp_dirty_pertxg[txg & TXG_MASK], txg);

		/*
		 * After the data blocks have been written (ensured by the zio_wait()
		 * above), update the user/group space accounting.
		 */
		for (ds = list_head(&synced_datasets); ds != NULL;
		    ds = list_next(&synced_datasets, ds)) {
			dmu_objset_do_userquota_updates(ds->ds_objset, tx);
		}

		/*
		 * Sync the datasets again to push out the changes due to
		 * userspace updates. This must be done before we process the
		 * sync tasks, so that any snapshots will have the correct
		 * user accounting information (and we won't get confused
		 * about which blocks are part of the snapshot).
		 */
		zio = zio_root(dp->dp_spa, NULL, NULL, ZIO_FLAG_MUSTSUCCEED);
		while ((ds = txg_list_remove(&dp->dp_dirty_datasets, txg)) != NULL) {
			ASSERT(list_link_active(&ds->ds_synced_link));
			dmu_buf_rele(ds->ds_dbuf, ds);
			dsl_dataset_sync(ds, zio, tx);
		}
		VERIFY0(zio_wait(zio));

		/*
		 * Now that the datasets have been completely synced, we can
		 * clean up our in-memory structures accumulated while syncing:
		 *
		 *  - move dead blocks from the pending deadlist to the on-disk
		 *    deadlist
		 *  - release hold from dsl_dataset_dirty()
		 */
		while ((ds = list_remove_head(&synced_datasets)) != NULL) {
			objset_t *os = ds->ds_objset;
			bplist_iterate(&ds->ds_pending_deadlist,
			    deadlist_enqueue_cb, &ds->ds_deadlist, tx);
			ASSERT(!dmu_objset_is_dirty(os, txg));
			dmu_buf_rele(ds->ds_dbuf, ds);
		}
		while ((dd = txg_list_remove(&dp->dp_dirty_dirs, txg)) != NULL) {
			dsl_dir_sync(dd, tx);
		}

		/*
		 * The MOS's space is accounted for in the pool/$MOS
		 * (dp_mos_dir). We can't modify the mos while we're syncing
		 * it, so we remember the deltas and apply them here.
		 */
		if (dp->dp_mos_used_delta != 0 || dp->dp_mos_compressed_delta != 0 ||
		    dp->dp_mos_uncompressed_delta != 0) {
			dsl_dir_diduse_space(dp->dp_mos_dir, DD_USED_HEAD,
			    dp->dp_mos_used_delta,
			    dp->dp_mos_compressed_delta,
			    dp->dp_mos_uncompressed_delta, tx);
			dp->dp_mos_used_delta = 0;
			dp->dp_mos_compressed_delta = 0;
			dp->dp_mos_uncompressed_delta = 0;
		}

		if (list_head(&mos->os_dirty_dnodes[txg & TXG_MASK]) != NULL ||
		    list_head(&mos->os_free_dnodes[txg & TXG_MASK]) != NULL) {
			dsl_pool_sync_mos(dp, tx);
		}

		/*
		 * If we modify a dataset in the same txg that we want to destroy it,
		 * its dsl_dir's dd_dbuf will be dirty, and thus have a hold on it.
		 * dsl_dir_destroy_check() will fail if there are unexpected holds.
		 * Therefore, we want to sync the MOS (thus syncing the dd_dbuf
		 * and clearing the hold on it) before we process the sync_tasks.
		 * The MOS data dirtied by the sync_tasks will be synced on the next
		 * pass.
		 */
		if (!txg_list_empty(&dp->dp_sync_tasks, txg)) {
			dsl_sync_task_t *dst;
			/*
			 * No more sync tasks should have been added while we
			 * were syncing.
			 */
			ASSERT3U(spa_sync_pass(dp->dp_spa), ==, 1);
			while ((dst = txg_list_remove(&dp->dp_sync_tasks, txg)) != NULL)
				dsl_sync_task_sync(dst, tx);
		}

		dmu_tx_commit(tx);

		DTRACE_PROBE2(dsl_pool_sync__done, dsl_pool_t *dp, dp, uint64_t, txg);
	}

	/*
	 * Post-sync cleanup for 'txg': for every zilog on the pool's dirty-zilog
	 * list, clean it and drop the dataset dbuf hold that was tagged with
	 * the zilog.
	 */
	void
	dsl_pool_sync_done(dsl_pool_t *dp, uint64_t txg)
	{
		zilog_t *zilog;

		for (zilog = txg_list_remove(&dp->dp_dirty_zilogs, txg);
		    zilog != NULL;
		    zilog = txg_list_remove(&dp->dp_dirty_zilogs, txg)) {
			dsl_dataset_t *ds = dmu_objset_ds(zilog->zl_os);

			zil_clean(zilog, txg);
			ASSERT(!dmu_objset_is_dirty(zilog->zl_os, txg));
			dmu_buf_rele(ds->ds_dbuf, zilog);
		}

		ASSERT(!dmu_objset_is_dirty(dp->dp_meta_objset, txg));
	}

	/*
	 * TRUE if the current thread is the tx_sync_thread or if we
	 * are being called from SPA context during pool initialization.
	 */
	int
	dsl_pool_sync_context(dsl_pool_t *dp)
	{
		return (curthread == dp->dp_tx.tx_sync_thread ||
		    spa_is_initializing(dp->dp_spa));
	}

	/*
	 * Return the pool's dspace minus the slop reservation. When 'netfree'
	 * is set the reservation is halved.
	 */
	uint64_t
	dsl_pool_adjustedsize(dsl_pool_t *dp, boolean_t netfree)
	{
		uint64_t dspace = spa_get_dspace(dp->dp_spa);
		uint64_t slop = spa_get_slop_space(dp->dp_spa);

		/*
		 * If we're trying to assess whether it's OK to do a free,
		 * cut the reservation in half to allow forward progress
		 * (e.g. make it possible to rm(1) files from a full pool).
		 */
		return (dspace - (netfree ? (slop >> 1) : slop));
	}

	/*
	 * Report whether dp_dirty_total exceeds the delay threshold, i.e.
	 * zfs_delay_min_dirty_percent percent of zfs_dirty_data_max. As a side
	 * effect, calls txg_kick() when dp_dirty_total exceeds
	 * zfs_dirty_data_sync.
	 */
	boolean_t
	dsl_pool_need_dirty_delay(dsl_pool_t *dp)
	{
		boolean_t need_delay;
		uint64_t threshold =
		    zfs_dirty_data_max * zfs_delay_min_dirty_percent / 100;

		mutex_enter(&dp->dp_lock);
		if (dp->dp_dirty_total > zfs_dirty_data_sync) {
			txg_kick(dp);
		}
		need_delay = (dp->dp_dirty_total > threshold);
		mutex_exit(&dp->dp_lock);

		return (need_delay);
	}

	/*
	 * Charge 'space' bytes of dirty data to the tx's txg slot and to the
	 * pool-wide dirty total. Non-positive values are ignored.
	 */
	void
	dsl_pool_dirty_space(dsl_pool_t *dp, int64_t space, dmu_tx_t *tx)
	{
		if (space > 0) {
			mutex_enter(&dp->dp_lock);
			dp->dp_dirty_pertxg[tx->tx_txg & TXG_MASK] += space;
			dsl_pool_dirty_delta(dp, space);
			mutex_exit(&dp->dp_lock);
		}
	}

	/*
	 * Release up to 'space' bytes of dirty accounting from txg's slot and
	 * from the pool-wide total. The amount is clamped to what the slot
	 * currently holds. 'space' must be non-negative; zero is a no-op.
	 */
	void
	dsl_pool_undirty_space(dsl_pool_t *dp, int64_t space, uint64_t txg)
	{
		ASSERT3S(space, >=, 0);

		if (space != 0) {
			mutex_enter(&dp->dp_lock);

			/* XXX writing something we didn't dirty? */
			if (dp->dp_dirty_pertxg[txg & TXG_MASK] < space)
				space = dp->dp_dirty_pertxg[txg & TXG_MASK];

			ASSERT3U(dp->dp_dirty_pertxg[txg & TXG_MASK], >=, space);
			dp->dp_dirty_pertxg[txg & TXG_MASK] -= space;

			ASSERT3U(dp->dp_dirty_total, >=, space);
			dsl_pool_dirty_delta(dp, -space);

			mutex_exit(&dp->dp_lock);
		}
	}

	/*
	 * dmu_objset_find_dp() callback used by dsl_pool_upgrade_clones().
	 *
	 * Starting from the head dataset 'hds', walk back along ds_prev_snap_obj
	 * for as long as each previous snapshot's ds_next_snap_obj points back
	 * at the dataset we came from. If the walk runs out of previous
	 * snapshots, attach the oldest dataset to dp_origin_snap. In either
	 * case, record the dataset in the previous snapshot's ds_next_clones_obj
	 * ZAP, creating that ZAP if needed. 'arg' is the syncing dmu_tx_t.
	 *
	 * Returns 0, or the error from dsl_dataset_hold_obj().
	 */
	/* ARGSUSED */
	static int
	upgrade_clones_cb(dsl_pool_t *dp, dsl_dataset_t *hds, void *arg)
	{
		dmu_tx_t *tx = arg;
		dsl_dataset_t *ds, *prev = NULL;
		int err;

		err = dsl_dataset_hold_obj(dp, hds->ds_object, FTAG, &ds);
		if (err)
			return (err);

		while (dsl_dataset_phys(ds)->ds_prev_snap_obj != 0) {
			err = dsl_dataset_hold_obj(dp,
			    dsl_dataset_phys(ds)->ds_prev_snap_obj, FTAG, &prev);
			if (err) {
				dsl_dataset_rele(ds, FTAG);
				return (err);
			}

			if (dsl_dataset_phys(prev)->ds_next_snap_obj != ds->ds_object)
				break;
			dsl_dataset_rele(ds, FTAG);
			ds = prev;
			prev = NULL;
		}

		if (prev == NULL) {
			prev = dp->dp_origin_snap;

			/*
			 * The $ORIGIN can't have any data, or the accounting
			 * will be wrong.
			 */
			ASSERT0(dsl_dataset_phys(prev)->ds_bp.blk_birth);

			/* The origin doesn't get attached to itself */
			if (ds->ds_object == prev->ds_object) {
				dsl_dataset_rele(ds, FTAG);
				return (0);
			}

			dmu_buf_will_dirty(ds->ds_dbuf, tx);
			dsl_dataset_phys(ds)->ds_prev_snap_obj = prev->ds_object;
			dsl_dataset_phys(ds)->ds_prev_snap_txg =
			    dsl_dataset_phys(prev)->ds_creation_txg;

			dmu_buf_will_dirty(ds->ds_dir->dd_dbuf, tx);
			dsl_dir_phys(ds->ds_dir)->dd_origin_obj = prev->ds_object;

			dmu_buf_will_dirty(prev->ds_dbuf, tx);
			dsl_dataset_phys(prev)->ds_num_children++;

			if (dsl_dataset_phys(ds)->ds_next_snap_obj == 0) {
				ASSERT(ds->ds_prev == NULL);
				VERIFY0(dsl_dataset_hold_obj(dp,
				    dsl_dataset_phys(ds)->ds_prev_snap_obj,
				    ds, &ds->ds_prev));
			}
		}

		ASSERT3U(dsl_dir_phys(ds->ds_dir)->dd_origin_obj, ==, prev->ds_object);
		ASSERT3U(dsl_dataset_phys(ds)->ds_prev_snap_obj, ==, prev->ds_object);

		if (dsl_dataset_phys(prev)->ds_next_clones_obj == 0) {
			dmu_buf_will_dirty(prev->ds_dbuf, tx);
			dsl_dataset_phys(prev)->ds_next_clones_obj =
			    zap_create(dp->dp_meta_objset,
			    DMU_OT_NEXT_CLONES, DMU_OT_NONE, 0, tx);
		}
		VERIFY0(zap_add_int(dp->dp_meta_objset,
		    dsl_dataset_phys(prev)->ds_next_clones_obj, ds->ds_object, tx));

		dsl_dataset_rele(ds, FTAG);
		/* dp_origin_snap is not held with FTAG here, so don't release it. */
		if (prev != dp->dp_origin_snap)
			dsl_dataset_rele(prev, FTAG);
		return (0);
	}

	/*
	 * Run upgrade_clones_cb() over the datasets under the root dir. Must be
	 * called in syncing context with dp_origin_snap already set up.
	 * (The -/+ pair below is the change carried by this diff.)
	 */
	void
	dsl_pool_upgrade_clones(dsl_pool_t *dp, dmu_tx_t *tx)
	{
		ASSERT(dmu_tx_is_syncing(tx));
		ASSERT(dp->dp_origin_snap != NULL);

		VERIFY0(dmu_objset_find_dp(dp, dp->dp_root_dir_obj, upgrade_clones_cb,
	- tx, DS_FIND_CHILDREN));
	+ tx, DS_FIND_CHILDREN | DS_FIND_SERIALIZE));
	}

	/*
	 * dmu_objset_find_dp() callback used by dsl_pool_upgrade_dir_clones().
	 *
	 * If the dataset's dsl_dir has an origin (dd_origin_obj != 0), add the
	 * dataset's object number to the dd_clones ZAP of the origin's dsl_dir,
	 * creating that ZAP first if it does not exist. 'arg' is the syncing
	 * dmu_tx_t. Always returns 0.
	 */
	/* ARGSUSED */
	static int
	upgrade_dir_clones_cb(dsl_pool_t *dp, dsl_dataset_t *ds, void *arg)
	{
		dmu_tx_t *tx = arg;
		objset_t *mos = dp->dp_meta_objset;

		if (dsl_dir_phys(ds->ds_dir)->dd_origin_obj != 0) {
			dsl_dataset_t *origin;

			VERIFY0(dsl_dataset_hold_obj(dp,
			    dsl_dir_phys(ds->ds_dir)->dd_origin_obj, FTAG, &origin));

			if (dsl_dir_phys(origin->ds_dir)->dd_clones == 0) {
				dmu_buf_will_dirty(origin->ds_dir->dd_dbuf, tx);
				dsl_dir_phys(origin->ds_dir)->dd_clones =
				    zap_create(mos, DMU_OT_DSL_CLONES, DMU_OT_NONE,
				    0, tx);
			}

			VERIFY0(zap_add_int(dp->dp_meta_objset,
			    dsl_dir_phys(origin->ds_dir)->dd_clones,
			    ds->ds_object, tx));

			dsl_dataset_rele(origin, FTAG);
		}
		return (0);
	}

	/*
	 * Create and open the free dir, allocate the pool's free bpobj and
	 * register it in the pool directory, then run upgrade_dir_clones_cb()
	 * over the datasets under the root dir. Must be called in syncing
	 * context. (The -/+ pair below is the change carried by this diff.)
	 */
	void
	dsl_pool_upgrade_dir_clones(dsl_pool_t *dp, dmu_tx_t *tx)
	{
		ASSERT(dmu_tx_is_syncing(tx));
		uint64_t obj;

		(void) dsl_dir_create_sync(dp, dp->dp_root_dir, FREE_DIR_NAME, tx);
		VERIFY0(dsl_pool_open_special_dir(dp,
		    FREE_DIR_NAME, &dp->dp_free_dir));

		/*
		 * We can't use bpobj_alloc(), because spa_version() still
		 * returns the old version, and we need a new-version bpobj with
		 * subobj support. So call dmu_object_alloc() directly.
		 */
		obj = dmu_object_alloc(dp->dp_meta_objset, DMU_OT_BPOBJ,
		    SPA_OLD_MAXBLOCKSIZE, DMU_OT_BPOBJ_HDR, sizeof (bpobj_phys_t), tx);
		VERIFY0(zap_add(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT,
		    DMU_POOL_FREE_BPOBJ, sizeof (uint64_t), 1, &obj, tx));
		VERIFY0(bpobj_open(&dp->dp_free_bpobj, dp->dp_meta_objset, obj));

		VERIFY0(dmu_objset_find_dp(dp, dp->dp_root_dir_obj,
	- upgrade_dir_clones_cb, tx, DS_FIND_CHILDREN));
	+ upgrade_dir_clones_cb, tx, DS_FIND_CHILDREN | DS_FIND_SERIALIZE));
	}

	/*
	 * Create the origin dataset under the root dir, snapshot it, and hold
	 * that snapshot in dp_origin_snap (tagged with dp). Must be called in
	 * syncing context with dp_config_rwlock held as writer, and only while
	 * dp_origin_snap is still NULL.
	 */
	void
	dsl_pool_create_origin(dsl_pool_t *dp, dmu_tx_t *tx)
	{
		uint64_t dsobj;
		dsl_dataset_t *ds;

		ASSERT(dmu_tx_is_syncing(tx));
		ASSERT(dp->dp_origin_snap == NULL);
		ASSERT(rrw_held(&dp->dp_config_rwlock, RW_WRITER));

		/* create the origin dir, ds, & snap-ds */
		dsobj = dsl_dataset_create_sync(dp->dp_root_dir, ORIGIN_DIR_NAME,
		    NULL, 0, kcred, tx);
		VERIFY0(dsl_dataset_hold_obj(dp, dsobj, FTAG, &ds));
		dsl_dataset_snapshot_sync_impl(ds, ORIGIN_DIR_NAME, tx);
		VERIFY0(dsl_dataset_hold_obj(dp, dsl_dataset_phys(ds)->ds_prev_snap_obj,
		    dp, &dp->dp_origin_snap));
		dsl_dataset_rele(ds, FTAG);
	}

	/*
	 * Accessor for the pool's dp_vnrele_taskq.
	 */
	taskq_t *
	dsl_pool_vnrele_taskq(dsl_pool_t *dp)
	{
		taskq_t *tq = dp->dp_vnrele_taskq;

		return (tq);
	}

	/*
	 * Walk through the pool-wide zap object of temporary snapshot user holds
	 * and release them.
	 *
	 * Entry names are split at the first '-': the part before it becomes
	 * the key in 'holds', and the part after it is added as a tag under
	 * that key. The resulting nvlist is passed to
	 * dsl_dataset_user_release_tmp(). Does nothing if the pool has no
	 * temporary-userrefs object.
	 */
	void
	dsl_pool_clean_tmp_userrefs(dsl_pool_t *dp)
	{
		zap_attribute_t za;
		zap_cursor_t zc;
		objset_t *mos = dp->dp_meta_objset;
		uint64_t zapobj = dp->dp_tmp_userrefs_obj;
		nvlist_t *holds;

		if (zapobj == 0)
			return;
		ASSERT(spa_version(dp->dp_spa) >= SPA_VERSION_USERREFS);

		holds = fnvlist_alloc();

		for (zap_cursor_init(&zc, mos, zapobj);
		    zap_cursor_retrieve(&zc, &za) == 0;
		    zap_cursor_advance(&zc)) {
			char *htag;
			nvlist_t *tags;

			htag = strchr(za.za_name, '-');
			if (htag == NULL) {
				/*
				 * No '-' separator in the name: skip the entry
				 * rather than dereference a NULL pointer.
				 */
				continue;
			}
			*htag = '\0';
			++htag;
			if (nvlist_lookup_nvlist(holds, za.za_name, &tags) != 0) {
				tags = fnvlist_alloc();
				fnvlist_add_boolean(tags, htag);
				fnvlist_add_nvlist(holds, za.za_name, tags);
				fnvlist_free(tags);
			} else {
				fnvlist_add_boolean(tags, htag);
			}
		}
		dsl_dataset_user_release_tmp(dp, holds);
		fnvlist_free(holds);
		zap_cursor_fini(&zc);
	}

	/*
	 * Create the pool-wide zap object for storing temporary snapshot holds,
	 * link it into the pool directory under DMU_POOL_TMP_USERREFS, and
	 * remember its object number in dp_tmp_userrefs_obj. Must be called in
	 * syncing context, and only while the object does not exist yet.
	 */
	void
	dsl_pool_user_hold_create_obj(dsl_pool_t *dp, dmu_tx_t *tx)
	{
		objset_t *mos = dp->dp_meta_objset;

		ASSERT(dp->dp_tmp_userrefs_obj == 0);
		ASSERT(dmu_tx_is_syncing(tx));

		dp->dp_tmp_userrefs_obj = zap_create_link(mos, DMU_OT_USERREFS,
		    DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_TMP_USERREFS, tx);
	}

	/*
	 * Common implementation for adding ('holding' set) or removing
	 * ('holding' clear) a temporary hold entry in the pool-wide zap.
	 *
	 * The entry is named "<dsobj in hex>-<tag>". When adding, 'now' is
	 * stored as the entry's single 8-byte value; when removing, 'now' is
	 * unused. Returns ENOENT if asked to remove while the zap object does
	 * not exist, otherwise the result of zap_add()/zap_remove().
	 */
	static int
	dsl_pool_user_hold_rele_impl(dsl_pool_t *dp, uint64_t dsobj,
	    const char *tag, uint64_t now, dmu_tx_t *tx, boolean_t holding)
	{
		objset_t *mos = dp->dp_meta_objset;
		uint64_t zapobj = dp->dp_tmp_userrefs_obj;
		char *name;
		int error;

		ASSERT(spa_version(dp->dp_spa) >= SPA_VERSION_USERREFS);
		ASSERT(dmu_tx_is_syncing(tx));

		/*
		 * If the pool was created prior to SPA_VERSION_USERREFS, the
		 * zap object for temporary holds might not exist yet.
		 */
		if (zapobj == 0) {
			if (holding) {
				dsl_pool_user_hold_create_obj(dp, tx);
				zapobj = dp->dp_tmp_userrefs_obj;
			} else {
				return (SET_ERROR(ENOENT));
			}
		}

		name = kmem_asprintf("%llx-%s", (u_longlong_t)dsobj, tag);
		if (holding)
			error = zap_add(mos, zapobj, name, 8, 1, &now, tx);
		else
			error = zap_remove(mos, zapobj, name, tx);
		strfree(name);

		return (error);
	}

	/*
	 * Add a temporary hold for the given dataset object and tag.
	 * 'now' is stored as the value of the hold entry.
	 */
	int
	dsl_pool_user_hold(dsl_pool_t *dp, uint64_t dsobj, const char *tag,
	    uint64_t now, dmu_tx_t *tx)
	{
		return (dsl_pool_user_hold_rele_impl(dp, dsobj, tag, now, tx, B_TRUE));
	}

	/*
	 * Release a temporary hold for the given dataset object and tag.
	 */
	int
	dsl_pool_user_release(dsl_pool_t *dp, uint64_t dsobj, const char *tag,
	    dmu_tx_t *tx)
	{
		/*
		 * The 'now' argument is a uint64_t that the release path does
		 * not use; pass 0 rather than the pointer constant NULL.
		 */
		return (dsl_pool_user_hold_rele_impl(dp, dsobj, tag, 0,
		    tx, B_FALSE));
	}

	/*
	* DSL Pool Configuration Lock
	*
	* The dp_config_rwlock protects against changes to DSL state (e.g. dataset
	* creation / destruction / rename / property setting). It must be held for
	* read to hold a dataset or dsl_dir. I.e. you must call
	* dsl_pool_config_enter() or dsl_pool_hold() before calling
	* dsl_{dataset,dir}_hold{_obj}. In most circumstances, the dp_config_rwlock
	* must be held continuously until all datasets and dsl_dirs are released.
	*
	* The only exception to this rule is that if a "long hold" is placed on
	* a dataset, then the dp_config_rwlock may be dropped while the dataset
	* is still held. The long hold will prevent the dataset from being
	* destroyed -- the destroy will fail with EBUSY. A long hold can be
	* obtained by calling dsl_dataset_long_hold(), or by "owning" a dataset
	* (by calling dsl_{dataset,objset}_{try}own{_obj}).
	*
	* Legitimate long-holders (including owners) should be long-running, cancelable
	* tasks that should cause "zfs destroy" to fail. This includes DMU
	* consumers (i.e. a ZPL filesystem being mounted or ZVOL being open),
	* "zfs send", and "zfs diff". There are several other long-holders whose
	* uses are suboptimal (e.g. "zfs promote", and zil_suspend()).
	*
	* The usual formula for long-holding would be:
	* dsl_pool_hold()
	* dsl_dataset_hold()
	* ... perform checks ...
	* dsl_dataset_long_hold()
	* dsl_pool_rele()
	* ... perform long-running task ...
	* dsl_dataset_long_rele()
	* dsl_dataset_rele()
	*
	* Note that when the long hold is released, the dataset is still held but
	* the pool is not held. The dataset may change arbitrarily during this time
	* (e.g. it could be destroyed). Therefore you shouldn't do anything to the
	* dataset except release it.
	*
	* User-initiated operations (e.g. ioctls, zfs_ioc_*()) are either read-only
	* or modifying operations.
	*
	* Modifying operations should generally use dsl_sync_task(). The synctask
	* infrastructure enforces proper locking strategy with respect to the
	* dp_config_rwlock. See the comment above dsl_sync_task() for details.
	*
	* Read-only operations will manually hold the pool, then the dataset, obtain
	* information from the dataset, then release the pool and dataset.
	* dmu_objset_{hold,rele}() are convenience routines that also do the pool
	* hold/rele.
	*/

	/*
	 * Open the pool called 'name' and enter its config lock as reader, both
	 * tagged with 'tag'. On success *dp is set to the pool's dsl_pool_t and
	 * 0 is returned; otherwise the spa_open() error is returned and *dp is
	 * left untouched. Undo with dsl_pool_rele().
	 */
	int
	dsl_pool_hold(const char *name, void *tag, dsl_pool_t **dp)
	{
		spa_t *spa;
		int error;

		error = spa_open(name, &spa, tag);
		if (error == 0) {
			*dp = spa_get_dsl(spa);
			dsl_pool_config_enter(*dp, tag);
		}
		return (error);
	}

	/*
	 * Undo dsl_pool_hold(): exit the config lock, then close the spa, both
	 * using 'tag'.
	 */
	void
	dsl_pool_rele(dsl_pool_t *dp, void *tag)
	{
		dsl_pool_config_exit(dp, tag);
		spa_close(dp->dp_spa, tag);
	}

	/*
	 * Enter dp_config_rwlock as reader, tagged with 'tag'. The calling
	 * thread must not already hold the lock as reader.
	 */
	void
	dsl_pool_config_enter(dsl_pool_t *dp, void *tag)
	{
		/*
		 * We use a "reentrant" reader-writer lock, but not reentrantly.
		 *
		 * The rrwlock can (with the track_all flag) track all reading threads,
		 * which is very useful for debugging which code path failed to release
		 * the lock, and for verifying that the *current* thread does hold
		 * the lock.
		 *
		 * (Unlike a rwlock, which knows that N threads hold it for
		 * read, but not *which* threads, so rw_held(RW_READER) returns TRUE
		 * if any thread holds it for read, even if this thread doesn't).
		 */
		ASSERT(!rrw_held(&dp->dp_config_rwlock, RW_READER));
		rrw_enter(&dp->dp_config_rwlock, RW_READER, tag);
	}

	/*
	 * Exit dp_config_rwlock using 'tag'.
	 */
	void
	dsl_pool_config_exit(dsl_pool_t *dp, void *tag)
	{
		rrw_exit(&dp->dp_config_rwlock, tag);
	}

	/*
	 * dsl_pool_config_held() reports RRW_LOCK_HELD() on dp_config_rwlock.
	 * The dsl_pool_config_held_writer() variant added by this change
	 * reports RRW_WRITE_HELD() on the same lock.
	 */
	boolean_t
	dsl_pool_config_held(dsl_pool_t *dp)
	{
	return (RRW_LOCK_HELD(&dp->dp_config_rwlock));
	+}
	+
	+boolean_t
	+dsl_pool_config_held_writer(dsl_pool_t *dp)
	+{
	+ return (RRW_WRITE_HELD(&dp->dp_config_rwlock));
	}
	Index: vendor-sys/illumos/dist/uts/common/fs/zfs/spa.c
	===================================================================
	--- vendor-sys/illumos/dist/uts/common/fs/zfs/spa.c (revision 284761)
	+++ vendor-sys/illumos/dist/uts/common/fs/zfs/spa.c (revision 284762)
	@@ -1,6664 +1,6665 @@
	/*
	* CDDL HEADER START
	*
	* The contents of this file are subject to the terms of the
	* Common Development and Distribution License (the "License").
	* You may not use this file except in compliance with the License.
	*
	* You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
	* or http://www.opensolaris.org/os/licensing.
	* See the License for the specific language governing permissions
	* and limitations under the License.
	*
	* When distributing Covered Code, include this CDDL HEADER in each
	* file and include the License file at usr/src/OPENSOLARIS.LICENSE.
	* If applicable, add the following below this CDDL HEADER, with the
	* fields enclosed by brackets "[]" replaced with your own identifying
	* information: Portions Copyright [yyyy] [name of copyright owner]
	*
	* CDDL HEADER END
	*/

	/*
	* Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
	* Copyright (c) 2011, 2014 by Delphix. All rights reserved.
	* Copyright (c) 2013, 2014, Nexenta Systems, Inc. All rights reserved.
	* Copyright (c) 2014 Spectra Logic Corporation, All rights reserved.
	*/

	/*
	* SPA: Storage Pool Allocator
	*
	* This file contains all the routines used when modifying on-disk SPA state.
	* This includes opening, importing, destroying, exporting a pool, and syncing a
	* pool.
	*/

	#include <sys/zfs_context.h>
	#include <sys/fm/fs/zfs.h>
	#include <sys/spa_impl.h>
	#include <sys/zio.h>
	#include <sys/zio_checksum.h>
	#include <sys/dmu.h>
	#include <sys/dmu_tx.h>
	#include <sys/zap.h>
	#include <sys/zil.h>
	#include <sys/ddt.h>
	#include <sys/vdev_impl.h>
	#include <sys/metaslab.h>
	#include <sys/metaslab_impl.h>
	#include <sys/uberblock_impl.h>
	#include <sys/txg.h>
	#include <sys/avl.h>
	#include <sys/dmu_traverse.h>
	#include <sys/dmu_objset.h>
	#include <sys/unique.h>
	#include <sys/dsl_pool.h>
	#include <sys/dsl_dataset.h>
	#include <sys/dsl_dir.h>
	#include <sys/dsl_prop.h>
	#include <sys/dsl_synctask.h>
	#include <sys/fs/zfs.h>
	#include <sys/arc.h>
	#include <sys/callb.h>
	#include <sys/systeminfo.h>
	#include <sys/spa_boot.h>
	#include <sys/zfs_ioctl.h>
	#include <sys/dsl_scan.h>
	#include <sys/zfeature.h>
	#include <sys/dsl_destroy.h>

	#ifdef _KERNEL
	#include <sys/bootprops.h>
	#include <sys/callb.h>
	#include <sys/cpupart.h>
	#include <sys/pool.h>
	#include <sys/sysdc.h>
	#include <sys/zone.h>
	#endif /* _KERNEL */

	#include "zfs_prop.h"
	#include "zfs_comutil.h"

	/*
	* The interval, in seconds, at which failed configuration cache file writes
	* should be retried.
	*/
	static int zfs_ccw_retry_interval = 300;

	/*
	 * Taskq sizing modes; stored in zio_taskq_info_t.zti_mode.
	 */
	typedef enum zti_modes {
	ZTI_MODE_FIXED, /* value is # of threads (min 1) */
	ZTI_MODE_BATCH, /* cpu-intensive; value is ignored */
	ZTI_MODE_NULL, /* don't create a taskq */
	ZTI_NMODES
	} zti_modes_t;

	/*
	 * Brace initializers for zio_taskq_info_t, in field order
	 * { zti_mode, zti_value, zti_count }.
	 */
	#define ZTI_P(n, q) { ZTI_MODE_FIXED, (n), (q) }
	#define ZTI_BATCH { ZTI_MODE_BATCH, 0, 1 }
	#define ZTI_NULL { ZTI_MODE_NULL, 0, 0 }

	/* Single-taskq shorthands built on ZTI_P(). */
	#define ZTI_N(n) ZTI_P(n, 1)
	#define ZTI_ONE ZTI_N(1)

	/* One cell of the zio_taskqs[][] table. */
	typedef struct zio_taskq_info {
	zti_modes_t zti_mode;
	uint_t zti_value;
	uint_t zti_count;
	} zio_taskq_info_t;

	/* Names of the taskq types, indexed in the order of the table columns. */
	static const char *const zio_taskq_types[ZIO_TASKQ_TYPES] = {
	"issue", "issue_high", "intr", "intr_high"
	};

	/*
	* This table defines the taskq settings for each ZFS I/O type. When
	* initializing a pool, we use this table to create an appropriately sized
	* taskq. Some operations are low volume and therefore have a small, static
	* number of threads assigned to their taskqs using the ZTI_N(#) or ZTI_ONE
	* macros. Other operations process a large amount of data; the ZTI_BATCH
	* macro causes us to create a taskq oriented for throughput. Some operations
	* are so high frequency and short-lived that the taskq itself can become a
	* point of lock contention. The ZTI_P(#, #) macro indicates that we need an
	* additional degree of parallelism specified by the number of threads per-
	* taskq and the number of taskqs; when dispatching an event in this case, the
	* particular taskq is chosen at random.
	*
	* The different taskq priorities are to handle the different contexts (issue
	* and interrupt) and then to reserve threads for ZIO_PRIORITY_NOW I/Os that
	* need to be handled with minimum delay.
	*/
	const zio_taskq_info_t zio_taskqs[ZIO_TYPES][ZIO_TASKQ_TYPES] = {
	/* ISSUE ISSUE_HIGH INTR INTR_HIGH */
	{ ZTI_ONE, ZTI_NULL, ZTI_ONE, ZTI_NULL }, /* NULL */
	{ ZTI_N(8), ZTI_NULL, ZTI_P(12, 8), ZTI_NULL }, /* READ */
	{ ZTI_BATCH, ZTI_N(5), ZTI_N(8), ZTI_N(5) }, /* WRITE */
	{ ZTI_P(12, 8), ZTI_NULL, ZTI_ONE, ZTI_NULL }, /* FREE */
	{ ZTI_ONE, ZTI_NULL, ZTI_ONE, ZTI_NULL }, /* CLAIM */
	{ ZTI_ONE, ZTI_NULL, ZTI_ONE, ZTI_NULL }, /* IOCTL */
	};

	/* Forward declarations for static functions defined later in this file. */
	static void spa_sync_version(void *arg, dmu_tx_t *tx);
	static void spa_sync_props(void *arg, dmu_tx_t *tx);
	static boolean_t spa_has_active_shared_spare(spa_t *spa);
	static int spa_load_impl(spa_t *spa, uint64_t, nvlist_t *config,
	    spa_load_state_t state, spa_import_type_t type, boolean_t mosconfig,
	    char **ereport);
	static void spa_vdev_resilver_done(spa_t *spa);

	uint_t zio_taskq_batch_pct = 75; /* 1 thread per cpu in pset */
	id_t zio_taskq_psrset_bind = PS_NONE;
	boolean_t zio_taskq_sysdc = B_TRUE; /* use SDC scheduling class */
	uint_t zio_taskq_basedc = 80; /* base duty cycle */

	boolean_t spa_create_process = B_TRUE; /* no process ==> no sysdc */
	extern int zfs_sync_pass_deferred_free;

	/*
	* This (illegal) pool name is used when temporarily importing a spa_t in order
	* to get the vdev stats associated with the imported devices.
	*/
	#define TRYIMPORT_NAME "$import"

	/*
	* ==========================================================================
	* SPA properties routines
	* ==========================================================================
	*/

	/*
	 * Add a (source=src, propname=propval) list to an nvlist.
	 *
	 * Builds a nested nvlist holding ZPROP_SOURCE = src and ZPROP_VALUE set
	 * to 'strval' when it is non-NULL, else to 'intval', then adds it to
	 * 'nvl' under the property's name. The temporary nvlist is freed before
	 * returning.
	 */
	static void
	spa_prop_add_list(nvlist_t *nvl, zpool_prop_t prop, char *strval,
	    uint64_t intval, zprop_source_t src)
	{
		const char *propname = zpool_prop_to_name(prop);
		nvlist_t *propval;

		VERIFY(nvlist_alloc(&propval, NV_UNIQUE_NAME, KM_SLEEP) == 0);
		VERIFY(nvlist_add_uint64(propval, ZPROP_SOURCE, src) == 0);

		if (strval != NULL)
			VERIFY(nvlist_add_string(propval, ZPROP_VALUE, strval) == 0);
		else
			VERIFY(nvlist_add_uint64(propval, ZPROP_VALUE, intval) == 0);

		VERIFY(nvlist_add_nvlist(nvl, propname, propval) == 0);
		nvlist_free(propval);
	}

	/*
	 * Get property values from the spa configuration.
	 *
	 * Adds to *nvp the properties derived from in-core state: space and
	 * health figures when a root vdev exists, freeing/leaked byte counts
	 * when the dsl pool exists, plus guid, comment, altroot, maxblocksize
	 * and cachefile. The caller must hold spa_props_lock.
	 */
	static void
	spa_prop_get_config(spa_t *spa, nvlist_t **nvp)
	{
		vdev_t *rvd = spa->spa_root_vdev;
		dsl_pool_t *pool = spa->spa_dsl_pool;
		uint64_t size, alloc, cap, version;
		zprop_source_t src = ZPROP_SRC_NONE;
		spa_config_dirent_t *dp;
		metaslab_class_t *mc = spa_normal_class(spa);

		ASSERT(MUTEX_HELD(&spa->spa_props_lock));

		if (rvd != NULL) {
			alloc = metaslab_class_get_alloc(spa_normal_class(spa));
			size = metaslab_class_get_space(spa_normal_class(spa));
			spa_prop_add_list(*nvp, ZPOOL_PROP_NAME, spa_name(spa), 0, src);
			spa_prop_add_list(*nvp, ZPOOL_PROP_SIZE, NULL, size, src);
			spa_prop_add_list(*nvp, ZPOOL_PROP_ALLOCATED, NULL, alloc, src);
			spa_prop_add_list(*nvp, ZPOOL_PROP_FREE, NULL,
			    size - alloc, src);

			spa_prop_add_list(*nvp, ZPOOL_PROP_FRAGMENTATION, NULL,
			    metaslab_class_fragmentation(mc), src);
			spa_prop_add_list(*nvp, ZPOOL_PROP_EXPANDSZ, NULL,
			    metaslab_class_expandable_space(mc), src);
			spa_prop_add_list(*nvp, ZPOOL_PROP_READONLY, NULL,
			    (spa_mode(spa) == FREAD), src);

			/* Avoid dividing by zero when the class has no space. */
			cap = (size == 0) ? 0 : (alloc * 100 / size);
			spa_prop_add_list(*nvp, ZPOOL_PROP_CAPACITY, NULL, cap, src);

			spa_prop_add_list(*nvp, ZPOOL_PROP_DEDUPRATIO, NULL,
			    ddt_get_pool_dedup_ratio(spa), src);

			spa_prop_add_list(*nvp, ZPOOL_PROP_HEALTH, NULL,
			    rvd->vdev_state, src);

			/*
			 * Note that 'src' is changed here and the new value is
			 * also used for the properties added below.
			 */
			version = spa_version(spa);
			if (version == zpool_prop_default_numeric(ZPOOL_PROP_VERSION))
				src = ZPROP_SRC_DEFAULT;
			else
				src = ZPROP_SRC_LOCAL;
			spa_prop_add_list(*nvp, ZPOOL_PROP_VERSION, NULL, version, src);
		}

		if (pool != NULL) {
			/*
			 * The $FREE directory was introduced in SPA_VERSION_DEADLISTS,
			 * when opening pools before this version freedir will be NULL.
			 */
			if (pool->dp_free_dir != NULL) {
				spa_prop_add_list(*nvp, ZPOOL_PROP_FREEING, NULL,
				    dsl_dir_phys(pool->dp_free_dir)->dd_used_bytes,
				    src);
			} else {
				spa_prop_add_list(*nvp, ZPOOL_PROP_FREEING,
				    NULL, 0, src);
			}

			if (pool->dp_leak_dir != NULL) {
				spa_prop_add_list(*nvp, ZPOOL_PROP_LEAKED, NULL,
				    dsl_dir_phys(pool->dp_leak_dir)->dd_used_bytes,
				    src);
			} else {
				spa_prop_add_list(*nvp, ZPOOL_PROP_LEAKED,
				    NULL, 0, src);
			}
		}

		spa_prop_add_list(*nvp, ZPOOL_PROP_GUID, NULL, spa_guid(spa), src);

		if (spa->spa_comment != NULL) {
			spa_prop_add_list(*nvp, ZPOOL_PROP_COMMENT, spa->spa_comment,
			    0, ZPROP_SRC_LOCAL);
		}

		if (spa->spa_root != NULL)
			spa_prop_add_list(*nvp, ZPOOL_PROP_ALTROOT, spa->spa_root,
			    0, ZPROP_SRC_LOCAL);

		if (spa_feature_is_enabled(spa, SPA_FEATURE_LARGE_BLOCKS)) {
			spa_prop_add_list(*nvp, ZPOOL_PROP_MAXBLOCKSIZE, NULL,
			    MIN(zfs_max_recordsize, SPA_MAXBLOCKSIZE), ZPROP_SRC_NONE);
		} else {
			spa_prop_add_list(*nvp, ZPOOL_PROP_MAXBLOCKSIZE, NULL,
			    SPA_OLD_MAXBLOCKSIZE, ZPROP_SRC_NONE);
		}

		/* Only the head of spa_config_list is reported as the cachefile. */
		if ((dp = list_head(&spa->spa_config_list)) != NULL) {
			if (dp->scd_path == NULL) {
				spa_prop_add_list(*nvp, ZPOOL_PROP_CACHEFILE,
				    "none", 0, ZPROP_SRC_LOCAL);
			} else if (strcmp(dp->scd_path, spa_config_path) != 0) {
				spa_prop_add_list(*nvp, ZPOOL_PROP_CACHEFILE,
				    dp->scd_path, 0, ZPROP_SRC_LOCAL);
			}
		}
	}

	/*
	 * Get zpool property values.
	 *
	 * Allocates *nvp and fills it first from the in-core configuration
	 * (spa_prop_get_config()) and then from the MOS pool property object,
	 * if there is one. Returns 0 on success. If the ZAP walk ends with an
	 * error other than ENOENT, *nvp is freed and set to NULL and that
	 * error is returned.
	 */
	int
	spa_prop_get(spa_t *spa, nvlist_t **nvp)
	{
		objset_t *mos = spa->spa_meta_objset;
		zap_cursor_t zc;
		zap_attribute_t za;
		int err;

		VERIFY(nvlist_alloc(nvp, NV_UNIQUE_NAME, KM_SLEEP) == 0);

		mutex_enter(&spa->spa_props_lock);

		/*
		 * Get properties from the spa config.
		 */
		spa_prop_get_config(spa, nvp);

		/* If no pool property object, no more prop to get. */
		if (mos == NULL || spa->spa_pool_props_object == 0) {
			mutex_exit(&spa->spa_props_lock);
			return (0);
		}

		/*
		 * Get properties from the MOS pool property object.
		 *
		 * NOTE(review): the 'break' statements on the error paths below
		 * leave only the switch, not this loop; iteration continues and
		 * 'err' is overwritten by the next zap_cursor_retrieve().
		 */
		for (zap_cursor_init(&zc, mos, spa->spa_pool_props_object);
		    (err = zap_cursor_retrieve(&zc, &za)) == 0;
		    zap_cursor_advance(&zc)) {
			uint64_t intval = 0;
			char *strval = NULL;
			zprop_source_t src = ZPROP_SRC_DEFAULT;
			zpool_prop_t prop;

			if ((prop = zpool_name_to_prop(za.za_name)) == ZPROP_INVAL)
				continue;

			switch (za.za_integer_length) {
			case 8:
				/* integer property */
				if (za.za_first_integer !=
				    zpool_prop_default_numeric(prop))
					src = ZPROP_SRC_LOCAL;

				if (prop == ZPOOL_PROP_BOOTFS) {
					dsl_pool_t *dp;
					dsl_dataset_t *ds = NULL;

					/* bootfs is stored as an object number. */
					dp = spa_get_dsl(spa);
					dsl_pool_config_enter(dp, FTAG);
					err = dsl_dataset_hold_obj(dp,
					    za.za_first_integer, FTAG, &ds);
					if (err != 0) {
						dsl_pool_config_exit(dp, FTAG);
						break;
					}

					strval = kmem_alloc(
					    MAXNAMELEN + strlen(MOS_DIR_NAME) + 1,
					    KM_SLEEP);
					dsl_dataset_name(ds, strval);
					dsl_dataset_rele(ds, FTAG);
					dsl_pool_config_exit(dp, FTAG);
				} else {
					strval = NULL;
					intval = za.za_first_integer;
				}

				spa_prop_add_list(*nvp, prop, strval, intval, src);

				if (strval != NULL)
					kmem_free(strval,
					    MAXNAMELEN + strlen(MOS_DIR_NAME) + 1);

				break;

			case 1:
				/* string property */
				strval = kmem_alloc(za.za_num_integers, KM_SLEEP);
				err = zap_lookup(mos, spa->spa_pool_props_object,
				    za.za_name, 1, za.za_num_integers, strval);
				if (err) {
					kmem_free(strval, za.za_num_integers);
					break;
				}
				spa_prop_add_list(*nvp, prop, strval, 0, src);
				kmem_free(strval, za.za_num_integers);
				break;

			default:
				break;
			}
		}
		zap_cursor_fini(&zc);
		mutex_exit(&spa->spa_props_lock);

		if (err && err != ENOENT) {
			nvlist_free(*nvp);
			*nvp = NULL;
			return (err);
		}

		return (0);
	}

	/*
	* Validate the given pool properties nvlist and modify the list
	* for the property values to be set.
	*/
	static int
	spa_prop_validate(spa_t *spa, nvlist_t *props)
	{
		nvpair_t *elem;
		int error = 0, reset_bootfs = 0;
		uint64_t objnum = 0;
		boolean_t has_feature = B_FALSE;

		/*
		 * Walk every pair in 'props' and stop at the first one that fails
		 * validation.  Returns 0 on success or an errno value.  When a
		 * bootfs entry was seen and everything validated, the string-valued
		 * bootfs entry is replaced below by its uint64 object number.
		 */
		elem = NULL;
		while ((elem = nvlist_next_nvpair(props, elem)) != NULL) {
			uint64_t intval;
			char *strval, *slash, *check, *fname;
			const char *propname = nvpair_name(elem);
			zpool_prop_t prop = zpool_name_to_prop(propname);

			switch (prop) {
			case ZPROP_INVAL:
				/* Only feature@... names are accepted here. */
				if (!zpool_prop_feature(propname)) {
					error = SET_ERROR(EINVAL);
					break;
				}

				/*
				 * Sanitize the input.
				 */
				if (nvpair_type(elem) != DATA_TYPE_UINT64) {
					error = SET_ERROR(EINVAL);
					break;
				}

				if (nvpair_value_uint64(elem, &intval) != 0) {
					error = SET_ERROR(EINVAL);
					break;
				}

				if (intval != 0) {
					error = SET_ERROR(EINVAL);
					break;
				}

				fname = strchr(propname, '@') + 1;
				if (zfeature_lookup_name(fname, NULL) != 0) {
					error = SET_ERROR(EINVAL);
					break;
				}

				has_feature = B_TRUE;
				break;

			case ZPOOL_PROP_VERSION:
				/*
				 * The version may not go backwards, may not exceed
				 * SPA_VERSION_BEFORE_FEATURES, and may not follow a
				 * feature property in the same list.
				 */
				error = nvpair_value_uint64(elem, &intval);
				if (!error &&
				    (intval < spa_version(spa) ||
				    intval > SPA_VERSION_BEFORE_FEATURES ||
				    has_feature))
					error = SET_ERROR(EINVAL);
				break;

			case ZPOOL_PROP_DELEGATION:
			case ZPOOL_PROP_AUTOREPLACE:
			case ZPOOL_PROP_LISTSNAPS:
			case ZPOOL_PROP_AUTOEXPAND:
				/* Boolean properties: only 0 and 1 are valid. */
				error = nvpair_value_uint64(elem, &intval);
				if (!error && intval > 1)
					error = SET_ERROR(EINVAL);
				break;

			case ZPOOL_PROP_BOOTFS:
				/*
				 * If the pool version is less than SPA_VERSION_BOOTFS,
				 * or the pool is still being created (version == 0),
				 * the bootfs property cannot be set.
				 */
				if (spa_version(spa) < SPA_VERSION_BOOTFS) {
					error = SET_ERROR(ENOTSUP);
					break;
				}

				/*
				 * Make sure the vdev config is bootable
				 */
				if (!vdev_is_bootable(spa->spa_root_vdev)) {
					error = SET_ERROR(ENOTSUP);
					break;
				}

				reset_bootfs = 1;

				error = nvpair_value_string(elem, &strval);

				if (!error) {
					objset_t *os;
					uint64_t propval;

					/* An empty value resets bootfs to its default. */
					if (strval == NULL || strval[0] == '\0') {
						objnum = zpool_prop_default_numeric(
						    ZPOOL_PROP_BOOTFS);
						break;
					}

					if ((error = dmu_objset_hold(strval, FTAG,
					    &os)) != 0)
						break;

					/*
					 * Must be ZPL, and its property settings
					 * must be supported by GRUB (compression
					 * is not gzip, and large blocks are not used).
					 */

					if (dmu_objset_type(os) != DMU_OST_ZFS) {
						error = SET_ERROR(ENOTSUP);
					} else if ((error =
					    dsl_prop_get_int_ds(dmu_objset_ds(os),
					    zfs_prop_to_name(ZFS_PROP_COMPRESSION),
					    &propval)) == 0 &&
					    !BOOTFS_COMPRESS_VALID(propval)) {
						error = SET_ERROR(ENOTSUP);
					} else if ((error =
					    dsl_prop_get_int_ds(dmu_objset_ds(os),
					    zfs_prop_to_name(ZFS_PROP_RECORDSIZE),
					    &propval)) == 0 &&
					    propval > SPA_OLD_MAXBLOCKSIZE) {
						error = SET_ERROR(ENOTSUP);
					} else {
						objnum = dmu_objset_id(os);
					}
					dmu_objset_rele(os, FTAG);
				}
				break;

			case ZPOOL_PROP_FAILUREMODE:
				error = nvpair_value_uint64(elem, &intval);
				if (!error && (intval < ZIO_FAILURE_MODE_WAIT ||
				    intval > ZIO_FAILURE_MODE_PANIC))
					error = SET_ERROR(EINVAL);

				/*
				 * This is a special case which only occurs when
				 * the pool has completely failed. This allows
				 * the user to change the in-core failmode property
				 * without syncing it out to disk (I/Os might
				 * currently be blocked). We do this by returning
				 * EIO to the caller (spa_prop_set) to trick it
				 * into thinking we encountered a property validation
				 * error.
				 */
				if (!error && spa_suspended(spa)) {
					spa->spa_failmode = intval;
					error = SET_ERROR(EIO);
				}
				break;

			case ZPOOL_PROP_CACHEFILE:
				if ((error = nvpair_value_string(elem, &strval)) != 0)
					break;

				/* "" and "none" are accepted as-is. */
				if (strval[0] == '\0')
					break;

				if (strcmp(strval, "none") == 0)
					break;

				/* Anything else must be an absolute path ... */
				if (strval[0] != '/') {
					error = SET_ERROR(EINVAL);
					break;
				}

				slash = strrchr(strval, '/');
				ASSERT(slash != NULL);

				/* ... whose last component is not empty, "." or "..". */
				if (slash[1] == '\0' || strcmp(slash, "/.") == 0 ||
				    strcmp(slash, "/..") == 0)
					error = SET_ERROR(EINVAL);
				break;

			case ZPOOL_PROP_COMMENT:
				if ((error = nvpair_value_string(elem, &strval)) != 0)
					break;
				/*
				 * Inspect every byte exactly once; the loop header
				 * is the only place 'check' advances, so the scan
				 * can never step over the terminating NUL.
				 */
				for (check = strval; *check != '\0'; check++) {
					/*
					 * The kernel doesn't have an easy isprint()
					 * check. For this kernel check, we merely
					 * check ASCII apart from DEL. Fix this if
					 * there is an easy-to-use kernel isprint().
					 *
					 * NOTE(review): where plain char is signed,
					 * bytes above 0x7f compare negative and are
					 * not rejected here -- confirm intent.
					 */
					if (*check >= 0x7f) {
						error = SET_ERROR(EINVAL);
						break;
					}
				}
				if (strlen(strval) > ZPROP_MAX_COMMENT)
					error = SET_ERROR(E2BIG);
				break;

			case ZPOOL_PROP_DEDUPDITTO:
				if (spa_version(spa) < SPA_VERSION_DEDUP)
					error = SET_ERROR(ENOTSUP);
				else
					error = nvpair_value_uint64(elem, &intval);
				if (error == 0 &&
				    intval != 0 && intval < ZIO_DEDUPDITTO_MIN)
					error = SET_ERROR(EINVAL);
				break;
			}

			if (error)
				break;
		}

		/*
		 * Swap the dataset-name form of bootfs for the object number
		 * computed above.
		 */
		if (!error && reset_bootfs) {
			error = nvlist_remove(props,
			    zpool_prop_to_name(ZPOOL_PROP_BOOTFS), DATA_TYPE_STRING);

			if (!error) {
				error = nvlist_add_uint64(props,
				    zpool_prop_to_name(ZPOOL_PROP_BOOTFS), objnum);
			}
		}

		return (error);
	}

	/*
	 * Record a new cache file location for the pool.
	 *
	 * Looks up the cachefile property in 'nvp'; if it is absent nothing
	 * happens.  Otherwise a new spa_config_dirent_t is pushed onto the head
	 * of spa->spa_config_list:
	 *   - ""      selects the default spa_config_path,
	 *   - "none"  stores a NULL path,
	 *   - any other value is duplicated and stored verbatim.
	 * When 'need_sync' is set, an asynchronous config update is requested.
	 */
	void
	spa_configfile_set(spa_t *spa, nvlist_t *nvp, boolean_t need_sync)
	{
		char *cachefile;
		spa_config_dirent_t *dp;

		if (nvlist_lookup_string(nvp, zpool_prop_to_name(ZPOOL_PROP_CACHEFILE),
		    &cachefile) != 0)
			return;

		dp = kmem_alloc(sizeof (spa_config_dirent_t),
		    KM_SLEEP);

		if (cachefile[0] == '\0')
			dp->scd_path = spa_strdup(spa_config_path);
		else if (strcmp(cachefile, "none") == 0)
			dp->scd_path = NULL;
		else
			dp->scd_path = spa_strdup(cachefile);

		list_insert_head(&spa->spa_config_list, dp);
		if (need_sync)
			spa_async_request(spa, SPA_ASYNC_CONFIG_UPDATE);
	}

	/*
	 * Validate and apply the pool properties in 'nvp'.
	 *
	 * Returns the validation error, if any.  Otherwise the list is scanned:
	 * cachefile, altroot and readonly are skipped; version and feature
	 * entries are applied right away through spa_sync_version(); the first
	 * property of any other kind ends the scan and causes the whole list to
	 * be handed to spa_sync_props() in a sync task.  Returns 0 or the error
	 * from a sync task.
	 */
	int
	spa_prop_set(spa_t *spa, nvlist_t *nvp)
	{
		int error;
		nvpair_t *elem = NULL;
		boolean_t need_sync = B_FALSE;

		if ((error = spa_prop_validate(spa, nvp)) != 0)
			return (error);

		while ((elem = nvlist_next_nvpair(nvp, elem)) != NULL) {
			zpool_prop_t prop = zpool_name_to_prop(nvpair_name(elem));

			if (prop == ZPOOL_PROP_CACHEFILE ||
			    prop == ZPOOL_PROP_ALTROOT ||
			    prop == ZPOOL_PROP_READONLY)
				continue;

			if (prop == ZPOOL_PROP_VERSION || prop == ZPROP_INVAL) {
				uint64_t ver;

				if (prop == ZPOOL_PROP_VERSION) {
					VERIFY(nvpair_value_uint64(elem, &ver) == 0);
				} else {
					/* A feature implies the features version. */
					ASSERT(zpool_prop_feature(nvpair_name(elem)));
					ver = SPA_VERSION_FEATURES;
					need_sync = B_TRUE;
				}

				/* Save time if the version is already set. */
				if (ver == spa_version(spa))
					continue;

				/*
				 * In addition to the pool directory object, we might
				 * create the pool properties object, the features for
				 * read object, the features for write object, or the
				 * feature descriptions object.
				 */
				error = dsl_sync_task(spa->spa_name, NULL,
				    spa_sync_version, &ver,
				    6, ZFS_SPACE_CHECK_RESERVED);
				if (error)
					return (error);
				continue;
			}

			/* Any other property needs the full property sync. */
			need_sync = B_TRUE;
			break;
		}

		if (need_sync) {
			return (dsl_sync_task(spa->spa_name, NULL, spa_sync_props,
			    nvp, 6, ZFS_SPACE_CHECK_RESERVED));
		}

		return (0);
	}

	/*
	 * If the bootfs property value is dsobj, clear it.
	 *
	 * Nothing is done unless spa->spa_bootfs equals 'dsobj' and the pool has
	 * a properties object.  In that case the bootfs entry is removed from the
	 * pool properties ZAP in transaction 'tx' (removal must succeed) and the
	 * in-core spa_bootfs is reset to 0.
	 */
	void
	spa_prop_clear_bootfs(spa_t *spa, uint64_t dsobj, dmu_tx_t *tx)
	{
		if (spa->spa_bootfs == dsobj && spa->spa_pool_props_object != 0) {
			VERIFY(zap_remove(spa->spa_meta_objset,
			    spa->spa_pool_props_object,
			    zpool_prop_to_name(ZPOOL_PROP_BOOTFS), tx) == 0);
			spa->spa_bootfs = 0;
		}
	}

	/ARGSUSED/
	static int
	spa_change_guid_check(void arg, dmu_tx_t tx)
	{
	uint64_t *newguid = arg;
	spa_t *spa = dmu_tx_pool(tx)->dp_spa;
	vdev_t *rvd = spa->spa_root_vdev;
	uint64_t vdev_state;

	spa_config_enter(spa, SCL_STATE, FTAG, RW_READER);
	vdev_state = rvd->vdev_state;
	spa_config_exit(spa, SCL_STATE, FTAG);

	if (vdev_state != VDEV_STATE_HEALTHY)
	return (SET_ERROR(ENXIO));

	ASSERT3U(spa_guid(spa), !=, *newguid);

	return (0);
	}

	/*
	 * Sync-task callback for spa_change_guid().
	 *
	 * 'arg' points at the new pool guid.  Installs it as the root vdev's
	 * guid, adjusts the root vdev's guid sum by the difference between the
	 * new and old guid, marks the root vdev's config dirty, and writes a
	 * "guid change" entry to the pool history.
	 */
	static void
	spa_change_guid_sync(void *arg, dmu_tx_t *tx)
	{
		uint64_t *newguid = arg;
		spa_t *spa = dmu_tx_pool(tx)->dp_spa;
		uint64_t oldguid;
		vdev_t *rvd = spa->spa_root_vdev;

		oldguid = spa_guid(spa);

		spa_config_enter(spa, SCL_STATE, FTAG, RW_READER);
		rvd->vdev_guid = *newguid;
		rvd->vdev_guid_sum += (*newguid - oldguid);
		vdev_config_dirty(rvd);
		spa_config_exit(spa, SCL_STATE, FTAG);

		spa_history_log_internal(spa, "guid change", tx, "old=%llu new=%llu",
		    oldguid, *newguid);
	}

	/*
	 * Give the pool a freshly generated GUID so that a pool assembled from
	 * a clone of these vdevs can later be imported alongside this one.  The
	 * root vdev's guid and the pool guid are changed and the vdevs are
	 * marked dirty.  Every vdev has to be online while this happens;
	 * otherwise the absent ones would be orphaned from the pool.  On
	 * success the config is synced and a sysevent is posted for watchers.
	 *
	 * Returns 0 on success or the error reported by the sync task.
	 */
	int
	spa_change_guid(spa_t *spa)
	{
		uint64_t new_guid;
		int rc;

		mutex_enter(&spa->spa_vdev_top_lock);
		mutex_enter(&spa_namespace_lock);

		new_guid = spa_generate_guid(NULL);
		rc = dsl_sync_task(spa->spa_name, spa_change_guid_check,
		    spa_change_guid_sync, &new_guid, 5, ZFS_SPACE_CHECK_RESERVED);
		if (rc != 0)
			goto unlock;

		/* Only publish the change once the sync task has succeeded. */
		spa_config_sync(spa, B_FALSE, B_TRUE);
		spa_event_notify(spa, NULL, ESC_ZFS_POOL_REGUID);

	unlock:
		mutex_exit(&spa_namespace_lock);
		mutex_exit(&spa->spa_vdev_top_lock);

		return (rc);
	}

	/*
	* ==========================================================================
	* SPA state manipulation (open/create/destroy/import/export)
	* ==========================================================================
	*/

	static int
	spa_error_entry_compare(const void a, const void b)
	{
	spa_error_entry_t sa = (spa_error_entry_t )a;
	spa_error_entry_t sb = (spa_error_entry_t )b;
	int ret;

	ret = bcmp(&sa->se_bookmark, &sb->se_bookmark,
	sizeof (zbookmark_phys_t));

	if (ret < 0)
	return (-1);
	else if (ret > 0)
	return (1);
	else
	return (0);
	}

	/*
	 * Utility function which retrieves copies of the current logs and
	 * re-initializes them in the process.
	 *
	 * The caller must hold spa_errlist_lock.  The 'last' and 'scrub' trees
	 * receive byte copies of spa_errlist_last and spa_errlist_scrub; both
	 * in-core trees are then re-created empty.
	 */
	void
	spa_get_errlists(spa_t *spa, avl_tree_t *last, avl_tree_t *scrub)
	{
		ASSERT(MUTEX_HELD(&spa->spa_errlist_lock));

		bcopy(&spa->spa_errlist_last, last, sizeof (avl_tree_t));
		bcopy(&spa->spa_errlist_scrub, scrub, sizeof (avl_tree_t));

		avl_create(&spa->spa_errlist_scrub,
		    spa_error_entry_compare, sizeof (spa_error_entry_t),
		    offsetof(spa_error_entry_t, se_avl));
		avl_create(&spa->spa_errlist_last,
		    spa_error_entry_compare, sizeof (spa_error_entry_t),
		    offsetof(spa_error_entry_t, se_avl));
	}

	/*
	 * Create the set of taskqs for zio type 't' and taskq type 'q', as
	 * described by zio_taskqs[t][q], and store them in
	 * spa->spa_zio_taskq[t][q].
	 *
	 * ZTI_MODE_NULL yields an empty set.  ZTI_MODE_FIXED uses the table's
	 * value (at least 1) as the thread count.  ZTI_MODE_BATCH sizes the
	 * taskq as a CPU percentage taken from zio_taskq_batch_pct.  Any other
	 * mode panics.
	 */
	static void
	spa_taskqs_init(spa_t *spa, zio_type_t t, zio_taskq_type_t q)
	{
		const zio_taskq_info_t *ztip = &zio_taskqs[t][q];
		enum zti_modes mode = ztip->zti_mode;
		uint_t value = ztip->zti_value;
		uint_t count = ztip->zti_count;
		spa_taskqs_t *tqs = &spa->spa_zio_taskq[t][q];
		char name[32];
		uint_t flags = 0;
		boolean_t batch = B_FALSE;

		if (mode == ZTI_MODE_NULL) {
			tqs->stqs_count = 0;
			tqs->stqs_taskq = NULL;
			return;
		}

		ASSERT3U(count, >, 0);

		tqs->stqs_count = count;
		tqs->stqs_taskq = kmem_alloc(count * sizeof (taskq_t *), KM_SLEEP);

		switch (mode) {
		case ZTI_MODE_FIXED:
			ASSERT3U(value, >=, 1);
			value = MAX(value, 1);
			break;

		case ZTI_MODE_BATCH:
			batch = B_TRUE;
			flags |= TASKQ_THREADS_CPU_PCT;
			value = zio_taskq_batch_pct;
			break;

		default:
			panic("unrecognized mode for %s_%s taskq (%u:%u) in "
			    "spa_activate()",
			    zio_type_name[t], zio_taskq_types[q], mode, value);
			break;
		}

		for (uint_t i = 0; i < count; i++) {
			taskq_t *tq;

			/* Only number the taskq names when there are several. */
			if (count > 1) {
				(void) snprintf(name, sizeof (name), "%s_%s_%u",
				    zio_type_name[t], zio_taskq_types[q], i);
			} else {
				(void) snprintf(name, sizeof (name), "%s_%s",
				    zio_type_name[t], zio_taskq_types[q]);
			}

			if (zio_taskq_sysdc && spa->spa_proc != &p0) {
				if (batch)
					flags |= TASKQ_DC_BATCH;

				tq = taskq_create_sysdc(name, value, 50, INT_MAX,
				    spa->spa_proc, zio_taskq_basedc, flags);
			} else {
				pri_t pri = maxclsyspri;
				/*
				 * The write issue taskq can be extremely CPU
				 * intensive. Run it at slightly lower priority
				 * than the other taskqs.
				 */
				if (t == ZIO_TYPE_WRITE && q == ZIO_TASKQ_ISSUE)
					pri--;

				tq = taskq_create_proc(name, value, pri, 50,
				    INT_MAX, spa->spa_proc, flags);
			}

			tqs->stqs_taskq[i] = tq;
		}
	}

	/*
	 * Tear down the taskq set for zio type 't' / taskq type 'q': destroy
	 * each taskq, release the pointer array and clear stqs_taskq.  A set
	 * that was never populated (NULL array) is left alone.
	 */
	static void
	spa_taskqs_fini(spa_t *spa, zio_type_t t, zio_taskq_type_t q)
	{
		spa_taskqs_t *tqs = &spa->spa_zio_taskq[t][q];
		uint_t i = 0;

		if (tqs->stqs_taskq == NULL) {
			ASSERT0(tqs->stqs_count);
			return;
		}

		while (i < tqs->stqs_count) {
			ASSERT3P(tqs->stqs_taskq[i], !=, NULL);
			taskq_destroy(tqs->stqs_taskq[i]);
			i++;
		}

		kmem_free(tqs->stqs_taskq, tqs->stqs_count * sizeof (taskq_t *));
		tqs->stqs_taskq = NULL;
	}

	/*
	 * Dispatch a task to the appropriate taskq for the ZFS I/O type and priority.
	 * Note that a type may have multiple discrete taskqs to avoid lock contention
	 * on the taskq itself. In that case we choose which taskq at random by using
	 * the low bits of gethrtime().
	 *
	 * 'func', 'arg', 'flags' and 'ent' are passed through unchanged to
	 * taskq_dispatch_ent().  The taskq set for (t, q) must be non-empty.
	 */
	void
	spa_taskq_dispatch_ent(spa_t *spa, zio_type_t t, zio_taskq_type_t q,
	    task_func_t *func, void *arg, uint_t flags, taskq_ent_t *ent)
	{
		spa_taskqs_t *tqs = &spa->spa_zio_taskq[t][q];
		taskq_t *tq;

		ASSERT3P(tqs->stqs_taskq, !=, NULL);
		ASSERT3U(tqs->stqs_count, !=, 0);

		if (tqs->stqs_count == 1) {
			tq = tqs->stqs_taskq[0];
		} else {
			tq = tqs->stqs_taskq[gethrtime() % tqs->stqs_count];
		}

		taskq_dispatch_ent(tq, func, arg, flags, ent);
	}

	/*
	 * Initialize the taskq set for every (zio type, taskq type) pair.
	 */
	static void
	spa_create_zio_taskqs(spa_t *spa)
	{
		int type, tq_type;

		for (type = 0; type < ZIO_TYPES; type++)
			for (tq_type = 0; tq_type < ZIO_TASKQ_TYPES; tq_type++)
				spa_taskqs_init(spa, type, tq_type);
	}

	#ifdef _KERNEL
	static void
	spa_thread(void *arg)
	{
	callb_cpr_t cprinfo;

	spa_t *spa = arg;
	user_t *pu = PTOU(curproc);

	CALLB_CPR_INIT(&cprinfo, &spa->spa_proc_lock, callb_generic_cpr,
	spa->spa_name);

	ASSERT(curproc != &p0);
	(void) snprintf(pu->u_psargs, sizeof (pu->u_psargs),
	"zpool-%s", spa->spa_name);
	(void) strlcpy(pu->u_comm, pu->u_psargs, sizeof (pu->u_comm));

	/* bind this thread to the requested psrset */
	if (zio_taskq_psrset_bind != PS_NONE) {
	pool_lock();
	mutex_enter(&cpu_lock);
	mutex_enter(&pidlock);
	mutex_enter(&curproc->p_lock);

	if (cpupart_bind_thread(curthread, zio_taskq_psrset_bind,
	0, NULL, NULL) == 0) {
	curthread->t_bind_pset = zio_taskq_psrset_bind;
	} else {
	cmn_err(CE_WARN,
	"Couldn't bind process for zfs pool \"%s\" to "
	"pset %d\n", spa->spa_name, zio_taskq_psrset_bind);
	}

	mutex_exit(&curproc->p_lock);
	mutex_exit(&pidlock);
	mutex_exit(&cpu_lock);
	pool_unlock();
	}

	if (zio_taskq_sysdc) {
	sysdc_thread_enter(curthread, 100, 0);
	}

	spa->spa_proc = curproc;
	spa->spa_did = curthread->t_did;

	spa_create_zio_taskqs(spa);

	mutex_enter(&spa->spa_proc_lock);
	ASSERT(spa->spa_proc_state == SPA_PROC_CREATED);

	spa->spa_proc_state = SPA_PROC_ACTIVE;
	cv_broadcast(&spa->spa_proc_cv);

	CALLB_CPR_SAFE_BEGIN(&cprinfo);
	while (spa->spa_proc_state == SPA_PROC_ACTIVE)
	cv_wait(&spa->spa_proc_cv, &spa->spa_proc_lock);
	CALLB_CPR_SAFE_END(&cprinfo, &spa->spa_proc_lock);

	ASSERT(spa->spa_proc_state == SPA_PROC_DEACTIVATE);
	spa->spa_proc_state = SPA_PROC_GONE;
	spa->spa_proc = &p0;
	cv_broadcast(&spa->spa_proc_cv);
	CALLB_CPR_EXIT(&cprinfo); /* drops spa_proc_lock */

	mutex_enter(&curproc->p_lock);
	lwp_exit();
	}
	#endif

	/*
	* Activate an uninitialized pool.
	*
	* Marks the pool POOL_STATE_ACTIVE with the given open 'mode', creates
	* the normal and log metaslab classes, and, when spa_create_process is
	* set and the pool is not the TRYIMPORT_NAME pool, tries to start a
	* covering process running spa_thread(). If no process ends up owning
	* the pool (spa_proc is still &p0), the zio taskqs are created here
	* instead. Finally the dirty/evicting lists, the vdev txg list and the
	* two error-log AVL trees are initialized.
	*/
	static void
	spa_activate(spa_t *spa, int mode)
	{
	ASSERT(spa->spa_state == POOL_STATE_UNINITIALIZED);

	spa->spa_state = POOL_STATE_ACTIVE;
	spa->spa_mode = mode;

	spa->spa_normal_class = metaslab_class_create(spa, zfs_metaslab_ops);
	spa->spa_log_class = metaslab_class_create(spa, zfs_metaslab_ops);

	/* Try to create a covering process */
	mutex_enter(&spa->spa_proc_lock);
	ASSERT(spa->spa_proc_state == SPA_PROC_NONE);
	ASSERT(spa->spa_proc == &p0);
	spa->spa_did = 0;

	/* Only create a process if we're going to be around a while. */
	if (spa_create_process && strcmp(spa->spa_name, TRYIMPORT_NAME) != 0) {
	if (newproc(spa_thread, (caddr_t)spa, syscid, maxclsyspri,
	NULL, 0) == 0) {
	/*
	* Wait for spa_thread() to move the state from CREATED to
	* ACTIVE; by then it has set spa_proc and spa_did.
	*/
	spa->spa_proc_state = SPA_PROC_CREATED;
	while (spa->spa_proc_state == SPA_PROC_CREATED) {
	cv_wait(&spa->spa_proc_cv,
	&spa->spa_proc_lock);
	}
	ASSERT(spa->spa_proc_state == SPA_PROC_ACTIVE);
	ASSERT(spa->spa_proc != &p0);
	ASSERT(spa->spa_did != 0);
	} else {
	#ifdef _KERNEL
	cmn_err(CE_WARN,
	"Couldn't create process for zfs pool \"%s\"\n",
	spa->spa_name);
	#endif
	}
	}
	mutex_exit(&spa->spa_proc_lock);

	/* If we didn't create a process, we need to create our taskqs. */
	if (spa->spa_proc == &p0) {
	spa_create_zio_taskqs(spa);
	}

	list_create(&spa->spa_config_dirty_list, sizeof (vdev_t),
	offsetof(vdev_t, vdev_config_dirty_node));
	list_create(&spa->spa_evicting_os_list, sizeof (objset_t),
	offsetof(objset_t, os_evicting_node));
	list_create(&spa->spa_state_dirty_list, sizeof (vdev_t),
	offsetof(vdev_t, vdev_state_dirty_node));

	txg_list_create(&spa->spa_vdev_txg_list,
	offsetof(struct vdev, vdev_txg_node));

	/* Both error-log trees are ordered by spa_error_entry_compare(). */
	avl_create(&spa->spa_errlist_scrub,
	spa_error_entry_compare, sizeof (spa_error_entry_t),
	offsetof(spa_error_entry_t, se_avl));
	avl_create(&spa->spa_errlist_last,
	spa_error_entry_compare, sizeof (spa_error_entry_t),
	offsetof(spa_error_entry_t, se_avl));
	}

	/*
	* Opposite of spa_activate().
	*
	* Expects syncing to be off and the dsl pool, root vdev and async zio
	* roots to be gone already. Destroys the lists, taskqs, metaslab classes
	* and error-log trees, returns the pool to POOL_STATE_UNINITIALIZED, and,
	* if a covering process exists, asks it to exit and waits for it.
	*/
	static void
	spa_deactivate(spa_t *spa)
	{
	ASSERT(spa->spa_sync_on == B_FALSE);
	ASSERT(spa->spa_dsl_pool == NULL);
	ASSERT(spa->spa_root_vdev == NULL);
	ASSERT(spa->spa_async_zio_root == NULL);
	ASSERT(spa->spa_state != POOL_STATE_UNINITIALIZED);

	spa_evicting_os_wait(spa);

	txg_list_destroy(&spa->spa_vdev_txg_list);

	list_destroy(&spa->spa_config_dirty_list);
	list_destroy(&spa->spa_evicting_os_list);
	list_destroy(&spa->spa_state_dirty_list);

	for (int t = 0; t < ZIO_TYPES; t++) {
	for (int q = 0; q < ZIO_TASKQ_TYPES; q++) {
	spa_taskqs_fini(spa, t, q);
	}
	}

	metaslab_class_destroy(spa->spa_normal_class);
	spa->spa_normal_class = NULL;

	metaslab_class_destroy(spa->spa_log_class);
	spa->spa_log_class = NULL;

	/*
	* If this was part of an import or the open otherwise failed, we may
	* still have errors left in the queues. Empty them just in case.
	*/
	spa_errlog_drain(spa);

	avl_destroy(&spa->spa_errlist_scrub);
	avl_destroy(&spa->spa_errlist_last);

	spa->spa_state = POOL_STATE_UNINITIALIZED;

	/*
	* Shut down the covering process, if any: move the state from ACTIVE
	* to DEACTIVATE, wake spa_thread(), and wait until it reports GONE.
	*/
	mutex_enter(&spa->spa_proc_lock);
	if (spa->spa_proc_state != SPA_PROC_NONE) {
	ASSERT(spa->spa_proc_state == SPA_PROC_ACTIVE);
	spa->spa_proc_state = SPA_PROC_DEACTIVATE;
	cv_broadcast(&spa->spa_proc_cv);
	while (spa->spa_proc_state == SPA_PROC_DEACTIVATE) {
	ASSERT(spa->spa_proc != &p0);
	cv_wait(&spa->spa_proc_cv, &spa->spa_proc_lock);
	}
	ASSERT(spa->spa_proc_state == SPA_PROC_GONE);
	spa->spa_proc_state = SPA_PROC_NONE;
	}
	ASSERT(spa->spa_proc == &p0);
	mutex_exit(&spa->spa_proc_lock);

	/*
	* We want to make sure spa_thread() has actually exited the ZFS
	* module, so that the module can't be unloaded out from underneath
	* it.
	*/
	if (spa->spa_did != 0) {
	thread_join(spa->spa_did);
	spa->spa_did = 0;
	}
	}

	/*
	 * Verify a pool configuration, and construct the vdev tree appropriately. This
	 * will create all the necessary vdevs in the appropriate layout, with each vdev
	 * in the CLOSED state. This will prep the pool before open/creation/import.
	 * All vdev validation is done by the vdev_alloc() routine.
	 *
	 * On success *vdp is the vdev built from 'nv' (attached under 'parent'
	 * with child index 'id') and 0 is returned.  Children listed under
	 * ZPOOL_CONFIG_CHILDREN are parsed recursively; a missing children array
	 * is not an error.  If the children lookup or any child fails, the vdev
	 * allocated here is freed, *vdp is set to NULL and an error is returned.
	 */
	static int
	spa_config_parse(spa_t *spa, vdev_t **vdp, nvlist_t *nv, vdev_t *parent,
	    uint_t id, int atype)
	{
		nvlist_t **child;
		uint_t children;
		int error;

		if ((error = vdev_alloc(spa, vdp, nv, parent, id, atype)) != 0)
			return (error);

		/* Leaf vdevs have no children to parse. */
		if ((*vdp)->vdev_ops->vdev_op_leaf)
			return (0);

		error = nvlist_lookup_nvlist_array(nv, ZPOOL_CONFIG_CHILDREN,
		    &child, &children);

		if (error == ENOENT)
			return (0);

		if (error) {
			vdev_free(*vdp);
			*vdp = NULL;
			return (SET_ERROR(EINVAL));
		}

		/* The index is unsigned to match both 'children' and 'id'. */
		for (uint_t c = 0; c < children; c++) {
			vdev_t *vd;
			if ((error = spa_config_parse(spa, &vd, child[c], *vdp, c,
			    atype)) != 0) {
				vdev_free(*vdp);
				*vdp = NULL;
				return (error);
			}
		}

		ASSERT(*vdp != NULL);

		return (0);
	}

	/*
	 * Opposite of spa_load().
	 *
	 * Must be called with spa_namespace_lock held.  Suspends async tasks,
	 * stops the sync thread, waits for outstanding async zios, closes the
	 * deferred bpobj and then, under SCL_ALL as writer, frees the vdev tree,
	 * closes the dsl pool, unloads the DDT, and releases the spare and
	 * l2cache device arrays and configs as well as the pool comment.
	 */
	static void
	spa_unload(spa_t *spa)
	{
		int i;

		ASSERT(MUTEX_HELD(&spa_namespace_lock));

		/*
		 * Stop async tasks.
		 */
		spa_async_suspend(spa);

		/*
		 * Stop syncing.
		 */
		if (spa->spa_sync_on) {
			txg_sync_stop(spa->spa_dsl_pool);
			spa->spa_sync_on = B_FALSE;
		}

		/*
		 * Wait for any outstanding async I/O to complete.
		 */
		if (spa->spa_async_zio_root != NULL) {
			for (i = 0; i < max_ncpus; i++)
				(void) zio_wait(spa->spa_async_zio_root[i]);
			kmem_free(spa->spa_async_zio_root, max_ncpus * sizeof (void *));
			spa->spa_async_zio_root = NULL;
		}

		bpobj_close(&spa->spa_deferred_bpobj);

		spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);

		/*
		 * Close all vdevs.
		 */
		if (spa->spa_root_vdev)
			vdev_free(spa->spa_root_vdev);
		ASSERT(spa->spa_root_vdev == NULL);

		/*
		 * Close the dsl pool.
		 */
		if (spa->spa_dsl_pool) {
			dsl_pool_close(spa->spa_dsl_pool);
			spa->spa_dsl_pool = NULL;
			spa->spa_meta_objset = NULL;
		}

		ddt_unload(spa);

		/*
		 * Drop and purge level 2 cache
		 */
		spa_l2cache_drop(spa);

		/* Release the spare vdevs, their array and their config. */
		for (i = 0; i < spa->spa_spares.sav_count; i++)
			vdev_free(spa->spa_spares.sav_vdevs[i]);
		if (spa->spa_spares.sav_vdevs) {
			kmem_free(spa->spa_spares.sav_vdevs,
			    spa->spa_spares.sav_count * sizeof (void *));
			spa->spa_spares.sav_vdevs = NULL;
		}
		if (spa->spa_spares.sav_config) {
			nvlist_free(spa->spa_spares.sav_config);
			spa->spa_spares.sav_config = NULL;
		}
		spa->spa_spares.sav_count = 0;

		/* Same for the l2cache vdevs, clearing their stats first. */
		for (i = 0; i < spa->spa_l2cache.sav_count; i++) {
			vdev_clear_stats(spa->spa_l2cache.sav_vdevs[i]);
			vdev_free(spa->spa_l2cache.sav_vdevs[i]);
		}
		if (spa->spa_l2cache.sav_vdevs) {
			kmem_free(spa->spa_l2cache.sav_vdevs,
			    spa->spa_l2cache.sav_count * sizeof (void *));
			spa->spa_l2cache.sav_vdevs = NULL;
		}
		if (spa->spa_l2cache.sav_config) {
			nvlist_free(spa->spa_l2cache.sav_config);
			spa->spa_l2cache.sav_config = NULL;
		}
		spa->spa_l2cache.sav_count = 0;

		spa->spa_async_suspended = 0;

		if (spa->spa_comment != NULL) {
			spa_strfree(spa->spa_comment);
			spa->spa_comment = NULL;
		}

		spa_config_exit(spa, SCL_ALL, FTAG);
	}

	/*
	 * Load (or re-load) the current list of vdevs describing the active spares for
	 * this pool. When this is called, we have some form of basic information in
	 * 'spa_spares.sav_config'. We parse this into vdevs, try to open them, and
	 * then re-generate a more complete list including status information.
	 *
	 * The caller must hold SCL_ALL as writer.
	 */
	static void
	spa_load_spares(spa_t *spa)
	{
		nvlist_t **spares;
		uint_t nspares;
		int i;
		vdev_t *vd, *tvd;

		ASSERT(spa_config_held(spa, SCL_ALL, RW_WRITER) == SCL_ALL);

		/*
		 * First, close and free any existing spare vdevs.
		 */
		for (i = 0; i < spa->spa_spares.sav_count; i++) {
			vd = spa->spa_spares.sav_vdevs[i];

			/* Undo the call to spa_activate() below */
			if ((tvd = spa_lookup_by_guid(spa, vd->vdev_guid,
			    B_FALSE)) != NULL && tvd->vdev_isspare)
				spa_spare_remove(tvd);
			vdev_close(vd);
			vdev_free(vd);
		}

		if (spa->spa_spares.sav_vdevs)
			kmem_free(spa->spa_spares.sav_vdevs,
			    spa->spa_spares.sav_count * sizeof (void *));

		if (spa->spa_spares.sav_config == NULL)
			nspares = 0;
		else
			VERIFY(nvlist_lookup_nvlist_array(spa->spa_spares.sav_config,
			    ZPOOL_CONFIG_SPARES, &spares, &nspares) == 0);

		spa->spa_spares.sav_count = (int)nspares;
		spa->spa_spares.sav_vdevs = NULL;

		if (nspares == 0)
			return;

		/*
		 * Construct the array of vdevs, opening them to get status in the
		 * process. For each spare, there is potentially two different vdev_t
		 * structures associated with it: one in the list of spares (used only
		 * for basic validation purposes) and one in the active vdev
		 * configuration (if it's spared in). During this phase we open and
		 * validate each vdev on the spare list. If the vdev also exists in the
		 * active configuration, then we also mark this vdev as an active spare.
		 */
		spa->spa_spares.sav_vdevs = kmem_alloc(nspares * sizeof (void *),
		    KM_SLEEP);
		for (i = 0; i < spa->spa_spares.sav_count; i++) {
			VERIFY(spa_config_parse(spa, &vd, spares[i], NULL, 0,
			    VDEV_ALLOC_SPARE) == 0);
			ASSERT(vd != NULL);

			spa->spa_spares.sav_vdevs[i] = vd;

			if ((tvd = spa_lookup_by_guid(spa, vd->vdev_guid,
			    B_FALSE)) != NULL) {
				if (!tvd->vdev_isspare)
					spa_spare_add(tvd);

				/*
				 * We only mark the spare active if we were successfully
				 * able to load the vdev. Otherwise, importing a pool
				 * with a bad active spare would result in strange
				 * behavior, because multiple pool would think the spare
				 * is actively in use.
				 *
				 * There is a vulnerability here to an equally bizarre
				 * circumstance, where a dead active spare is later
				 * brought back to life (onlined or otherwise). Given
				 * the rarity of this scenario, and the extra complexity
				 * it adds, we ignore the possibility.
				 */
				if (!vdev_is_dead(tvd))
					spa_spare_activate(tvd);
			}

			vd->vdev_top = vd;
			vd->vdev_aux = &spa->spa_spares;

			/* A spare that cannot be opened is kept but not registered. */
			if (vdev_open(vd) != 0)
				continue;

			if (vdev_validate_aux(vd) == 0)
				spa_spare_add(vd);
		}

		/*
		 * Recompute the stashed list of spares, with status information
		 * this time.
		 */
		VERIFY(nvlist_remove(spa->spa_spares.sav_config, ZPOOL_CONFIG_SPARES,
		    DATA_TYPE_NVLIST_ARRAY) == 0);

		spares = kmem_alloc(spa->spa_spares.sav_count * sizeof (void *),
		    KM_SLEEP);
		for (i = 0; i < spa->spa_spares.sav_count; i++)
			spares[i] = vdev_config_generate(spa,
			    spa->spa_spares.sav_vdevs[i], B_TRUE, VDEV_CONFIG_SPARE);
		VERIFY(nvlist_add_nvlist_array(spa->spa_spares.sav_config,
		    ZPOOL_CONFIG_SPARES, spares, spa->spa_spares.sav_count) == 0);
		/* The temporary nvlists and the array holding them are released. */
		for (i = 0; i < spa->spa_spares.sav_count; i++)
			nvlist_free(spares[i]);
		kmem_free(spares, spa->spa_spares.sav_count * sizeof (void *));
	}

	/*
	 * Load (or re-load) the current list of vdevs describing the active l2cache for
	 * this pool. When this is called, we have some form of basic information in
	 * 'spa_l2cache.sav_config'. We parse this into vdevs, try to open them, and
	 * then re-generate a more complete list including status information.
	 * Devices which are already active have their details maintained, and are
	 * not re-opened.
	 *
	 * The caller must hold SCL_ALL as writer.
	 */
	static void
	spa_load_l2cache(spa_t *spa)
	{
		nvlist_t **l2cache;
		uint_t nl2cache;
		int i, j, oldnvdevs;
		uint64_t guid;
		vdev_t *vd, **oldvdevs, **newvdevs;
		spa_aux_vdev_t *sav = &spa->spa_l2cache;

		ASSERT(spa_config_held(spa, SCL_ALL, RW_WRITER) == SCL_ALL);

		if (sav->sav_config != NULL) {
			VERIFY(nvlist_lookup_nvlist_array(sav->sav_config,
			    ZPOOL_CONFIG_L2CACHE, &l2cache, &nl2cache) == 0);
			newvdevs = kmem_alloc(nl2cache * sizeof (void *), KM_SLEEP);
		} else {
			nl2cache = 0;
			newvdevs = NULL;
		}

		/*
		 * Detach the current array; sav_count stays 0 until the new array
		 * is installed further down.
		 */
		oldvdevs = sav->sav_vdevs;
		oldnvdevs = sav->sav_count;
		sav->sav_vdevs = NULL;
		sav->sav_count = 0;

		/*
		 * Process new nvlist of vdevs.
		 */
		for (i = 0; i < nl2cache; i++) {
			VERIFY(nvlist_lookup_uint64(l2cache[i], ZPOOL_CONFIG_GUID,
			    &guid) == 0);

			newvdevs[i] = NULL;
			for (j = 0; j < oldnvdevs; j++) {
				vd = oldvdevs[j];
				if (vd != NULL && guid == vd->vdev_guid) {
					/*
					 * Retain previous vdev for add/remove ops.
					 */
					newvdevs[i] = vd;
					oldvdevs[j] = NULL;
					break;
				}
			}

			if (newvdevs[i] == NULL) {
				/*
				 * Create new vdev
				 */
				VERIFY(spa_config_parse(spa, &vd, l2cache[i], NULL, 0,
				    VDEV_ALLOC_L2CACHE) == 0);
				ASSERT(vd != NULL);
				newvdevs[i] = vd;

				/*
				 * Commit this vdev as an l2cache device,
				 * even if it fails to open.
				 */
				spa_l2cache_add(vd);

				vd->vdev_top = vd;
				vd->vdev_aux = sav;

				spa_l2cache_activate(vd);

				if (vdev_open(vd) != 0)
					continue;

				(void) vdev_validate_aux(vd);

				if (!vdev_is_dead(vd))
					l2arc_add_vdev(spa, vd);
			}
		}

		/*
		 * Purge vdevs that were dropped
		 */
		for (i = 0; i < oldnvdevs; i++) {
			uint64_t pool;

			vd = oldvdevs[i];
			if (vd != NULL) {
				ASSERT(vd->vdev_isl2cache);

				if (spa_l2cache_exists(vd->vdev_guid, &pool) &&
				    pool != 0ULL && l2arc_vdev_present(vd))
					l2arc_remove_vdev(vd);
				vdev_clear_stats(vd);
				vdev_free(vd);
			}
		}

		if (oldvdevs)
			kmem_free(oldvdevs, oldnvdevs * sizeof (void *));

		/*
		 * Without a config, sav_count is still 0 here, so the cleanup at
		 * 'out' touches nothing.
		 */
		if (sav->sav_config == NULL)
			goto out;

		sav->sav_vdevs = newvdevs;
		sav->sav_count = (int)nl2cache;

		/*
		 * Recompute the stashed list of l2cache devices, with status
		 * information this time.
		 */
		VERIFY(nvlist_remove(sav->sav_config, ZPOOL_CONFIG_L2CACHE,
		    DATA_TYPE_NVLIST_ARRAY) == 0);

		l2cache = kmem_alloc(sav->sav_count * sizeof (void *), KM_SLEEP);
		for (i = 0; i < sav->sav_count; i++)
			l2cache[i] = vdev_config_generate(spa,
			    sav->sav_vdevs[i], B_TRUE, VDEV_CONFIG_L2CACHE);
		VERIFY(nvlist_add_nvlist_array(sav->sav_config,
		    ZPOOL_CONFIG_L2CACHE, l2cache, sav->sav_count) == 0);
	out:
		for (i = 0; i < sav->sav_count; i++)
			nvlist_free(l2cache[i]);
		if (sav->sav_count)
			kmem_free(l2cache, sav->sav_count * sizeof (void *));
	}

	/*
	 * Read the packed nvlist stored in MOS object 'obj' and unpack it into
	 * *value.
	 *
	 * The packed size is taken from the first uint64_t of the object's bonus
	 * buffer.  *value is set to NULL up front, so it stays NULL if a step
	 * before the unpack fails.  Returns 0 on success or the error from
	 * dmu_bonus_hold(), dmu_read() or nvlist_unpack().
	 */
	static int
	load_nvlist(spa_t *spa, uint64_t obj, nvlist_t **value)
	{
		dmu_buf_t *db;
		char *packed = NULL;
		size_t nvsize = 0;
		int error;
		*value = NULL;

		/*
		 * Report a failed bonus hold to the caller like any other read
		 * error instead of panicking.
		 */
		error = dmu_bonus_hold(spa->spa_meta_objset, obj, FTAG, &db);
		if (error != 0)
			return (error);
		nvsize = *(uint64_t *)db->db_data;
		dmu_buf_rele(db, FTAG);

		packed = kmem_alloc(nvsize, KM_SLEEP);
		error = dmu_read(spa->spa_meta_objset, obj, 0, nvsize, packed,
		    DMU_READ_PREFETCH);
		if (error == 0)
			error = nvlist_unpack(packed, nvsize, value, 0);
		kmem_free(packed, nvsize);

		return (error);
	}

	/*
	 * Walk the vdev tree rooted at 'vd' depth-first.  For every leaf that is
	 * dead and is not a hole, post an autoreplace request followed by an
	 * ESC_ZFS_VDEV_CHECK event so that the autoreplace code learns that the
	 * device could not be opened.
	 */
	static void
	spa_check_removed(vdev_t *vd)
	{
		int c;

		/* Children are handled before the vdev itself. */
		for (c = 0; c < vd->vdev_children; c++)
			spa_check_removed(vd->vdev_child[c]);

		if (!vd->vdev_ops->vdev_op_leaf)
			return;
		if (!vdev_is_dead(vd))
			return;
		if (vd->vdev_ishole)
			return;

		zfs_post_autoreplace(vd->vdev_spa, vd);
		spa_event_notify(vd->vdev_spa, vd, ESC_ZFS_VDEV_CHECK);
	}

	/*
	 * Validate the current config against the MOS config
	 *
	 * 'config' is the MOS config; its vdev tree is parsed into a temporary
	 * root (mrvd) and compared, top-level vdev by top-level vdev, with the
	 * pool's current root vdev.  Returns B_TRUE when, after any fix-ups, the
	 * root vdev's guid sum matches the one recorded in the uberblock.
	 */
	static boolean_t
	spa_config_valid(spa_t *spa, nvlist_t *config)
	{
		vdev_t *mrvd, *rvd = spa->spa_root_vdev;
		nvlist_t *nv;

		VERIFY(nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, &nv) == 0);

		spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
		VERIFY(spa_config_parse(spa, &mrvd, nv, NULL, 0, VDEV_ALLOC_LOAD) == 0);

		ASSERT3U(rvd->vdev_children, ==, mrvd->vdev_children);

		/*
		 * If we're doing a normal import, then build up any additional
		 * diagnostic information about missing devices in this config.
		 * We'll pass this up to the user for further processing.
		 */
		if (!(spa->spa_import_flags & ZFS_IMPORT_MISSING_LOG)) {
			nvlist_t **child, *missing;
			uint64_t idx = 0;

			child = kmem_alloc(rvd->vdev_children * sizeof (nvlist_t *),
			    KM_SLEEP);
			VERIFY(nvlist_alloc(&missing, NV_UNIQUE_NAME, KM_SLEEP) == 0);

			/* Collect log vdevs that are missing now but known to the MOS. */
			for (int c = 0; c < rvd->vdev_children; c++) {
				vdev_t *tvd = rvd->vdev_child[c];
				vdev_t *mtvd = mrvd->vdev_child[c];

				if (tvd->vdev_ops == &vdev_missing_ops &&
				    mtvd->vdev_ops != &vdev_missing_ops &&
				    mtvd->vdev_islog)
					child[idx++] = vdev_config_generate(spa, mtvd,
					    B_FALSE, 0);
			}

			if (idx) {
				VERIFY(nvlist_add_nvlist_array(missing,
				    ZPOOL_CONFIG_CHILDREN, child, idx) == 0);
				VERIFY(nvlist_add_nvlist(spa->spa_load_info,
				    ZPOOL_CONFIG_MISSING_DEVICES, missing) == 0);

				for (uint64_t i = 0; i < idx; i++)
					nvlist_free(child[i]);
			}
			nvlist_free(missing);
			/* Freed with the same element size it was allocated with. */
			kmem_free(child, rvd->vdev_children * sizeof (nvlist_t *));
		}

		/*
		 * Compare the root vdev tree with the information we have
		 * from the MOS config (mrvd). Check each top-level vdev
		 * with the corresponding MOS config top-level (mtvd).
		 */
		for (int c = 0; c < rvd->vdev_children; c++) {
			vdev_t *tvd = rvd->vdev_child[c];
			vdev_t *mtvd = mrvd->vdev_child[c];

			/*
			 * Resolve any "missing" vdevs in the current configuration.
			 * If we find that the MOS config has more accurate information
			 * about the top-level vdev then use that vdev instead.
			 */
			if (tvd->vdev_ops == &vdev_missing_ops &&
			    mtvd->vdev_ops != &vdev_missing_ops) {

				if (!(spa->spa_import_flags & ZFS_IMPORT_MISSING_LOG))
					continue;

				/*
				 * Device specific actions.
				 */
				if (mtvd->vdev_islog) {
					spa_set_log_state(spa, SPA_LOG_CLEAR);
				} else {
					/*
					 * XXX - once we have 'readonly' pool
					 * support we should be able to handle
					 * missing data devices by transitioning
					 * the pool to readonly.
					 */
					continue;
				}

				/*
				 * Swap the missing vdev with the data we were
				 * able to obtain from the MOS config.
				 */
				vdev_remove_child(rvd, tvd);
				vdev_remove_child(mrvd, mtvd);

				vdev_add_child(rvd, mtvd);
				vdev_add_child(mrvd, tvd);

				/* The config lock is dropped around vdev_load(). */
				spa_config_exit(spa, SCL_ALL, FTAG);
				vdev_load(mtvd);
				spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);

				vdev_reopen(rvd);
			} else if (mtvd->vdev_islog) {
				/*
				 * Load the slog device's state from the MOS config
				 * since it's possible that the label does not
				 * contain the most up-to-date information.
				 */
				vdev_load_log_state(tvd, mtvd);
				vdev_reopen(tvd);
			}
		}
		vdev_free(mrvd);
		spa_config_exit(spa, SCL_ALL, FTAG);

		/*
		 * Ensure we were able to validate the config.
		 */
		return (rvd->vdev_guid_sum == spa->spa_uberblock.ub_guid_sum);
	}

	/*
	* Check for missing log devices
	*
	* Only acts when spa_log_state is SPA_LOG_MISSING or SPA_LOG_UNKNOWN: the
	* pool's datasets are walked with zil_check_log_chain, and a non-zero
	* result from the walk sets the log state to SPA_LOG_MISSING. Returns
	* B_TRUE in that case and B_FALSE otherwise.
	*
	* NOTE(review): the '+' and '-' lines below are diff markers from the
	* enclosing patch (the walk moves from dmu_objset_find() to
	* dmu_objset_find_dp()); they are left untouched here.
	*/
	static boolean_t
	spa_check_logs(spa_t *spa)
	{
	boolean_t rv = B_FALSE;
	+ dsl_pool_t *dp = spa_get_dsl(spa);

	switch (spa->spa_log_state) {
	case SPA_LOG_MISSING:
	/* need to recheck in case slog has been restored */
	case SPA_LOG_UNKNOWN:
	- rv = (dmu_objset_find(spa->spa_name, zil_check_log_chain,
	- NULL, DS_FIND_CHILDREN) != 0);
	+ rv = (dmu_objset_find_dp(dp, dp->dp_root_dir_obj,
	+ zil_check_log_chain, NULL, DS_FIND_CHILDREN) != 0);
	if (rv)
	spa_set_log_state(spa, SPA_LOG_MISSING);
	break;
	}
	return (rv);
	}

	/*
	 * Passivate the metaslab group of every top-level log vdev.  The caller
	 * holds SCL_ALLOC as writer.  Returns B_TRUE if at least one log vdev
	 * was found, B_FALSE otherwise (including when the pool has no slogs).
	 */
	static boolean_t
	spa_passivate_log(spa_t *spa)
	{
		vdev_t *rvd = spa->spa_root_vdev;
		boolean_t found = B_FALSE;
		int c;

		ASSERT(spa_config_held(spa, SCL_ALLOC, RW_WRITER));

		if (!spa_has_slogs(spa))
			return (B_FALSE);

		for (c = 0; c < rvd->vdev_children; c++) {
			vdev_t *top = rvd->vdev_child[c];
			metaslab_group_t *group = top->vdev_mg;

			if (!top->vdev_islog)
				continue;

			metaslab_group_passivate(group);
			found = B_TRUE;
		}

		return (found);
	}

	/*
	 * Activate the metaslab group of every top-level log vdev.  The caller
	 * holds SCL_ALLOC as writer.
	 */
	static void
	spa_activate_log(spa_t *spa)
	{
		vdev_t *rvd = spa->spa_root_vdev;
		int c;

		ASSERT(spa_config_held(spa, SCL_ALLOC, RW_WRITER));

		for (c = 0; c < rvd->vdev_children; c++) {
			vdev_t *top = rvd->vdev_child[c];
			metaslab_group_t *group = top->vdev_mg;

			if (!top->vdev_islog)
				continue;

			metaslab_group_activate(group);
		}
	}

	/*
	 * Run zil_vdev_offline over the pool's datasets.  Returns the result of
	 * that walk; on success the current txg is synced out before returning.
	 */
	int
	spa_offline_log(spa_t *spa)
	{
		int rc = dmu_objset_find(spa_name(spa), zil_vdev_offline,
		    NULL, DS_FIND_CHILDREN);

		if (rc != 0)
			return (rc);

		/*
		 * We successfully offlined the log device, sync out the
		 * current txg so that the "stubby" block can be removed
		 * by zil_sync().
		 */
		txg_wait_synced(spa->spa_dsl_pool, 0);

		return (rc);
	}

	/*
	 * Call spa_check_removed() on each of the sav_count vdevs held in the
	 * auxiliary vdev list, in index order.
	 */
	static void
	spa_aux_check_removed(spa_aux_vdev_t *sav)
	{
		int idx = 0;

		while (idx < sav->sav_count) {
			spa_check_removed(sav->sav_vdevs[idx]);
			idx++;
		}
	}

	/*
	 * I/O completion hook: raise spa_claim_max_txg to the birth txg of
	 * the block just processed, if that birth txg is larger.  Failed
	 * I/Os are ignored.
	 */
	void
	spa_claim_notify(zio_t *zio)
	{
		spa_t *spa = zio->io_spa;

		if (zio->io_error != 0)
			return;

		/* any mutex will do */
		mutex_enter(&spa->spa_props_lock);
		if (zio->io_bp->blk_birth > spa->spa_claim_max_txg)
			spa->spa_claim_max_txg = zio->io_bp->blk_birth;
		mutex_exit(&spa->spa_props_lock);
	}

	/*
	 * Error tallies gathered while verifying a pool at load time; bumped
	 * by spa_load_verify_done() for each failed read.
	 */
	struct spa_load_error {
		uint64_t	sle_meta_count;	/* failed metadata reads */
		uint64_t	sle_data_count;	/* all other failed reads */
	};
	typedef struct spa_load_error spa_load_error_t;

	/*
	 * Completion callback for the reads issued by spa_load_verify_cb().
	 *
	 * On error, bumps the spa_load_error_t counters hanging off
	 * zio->io_private: sle_meta_count for indirect (level != 0) or
	 * metadata-typed blocks other than the intent log, sle_data_count
	 * for everything else.  Always frees the read buffer and releases
	 * one slot of spa_scrub_inflight, waking any waiters.
	 */
	static void
	spa_load_verify_done(zio_t *zio)
	{
		blkptr_t *bp = zio->io_bp;
		spa_load_error_t *sle = zio->io_private;
		dmu_object_type_t type = BP_GET_TYPE(bp);
		int error = zio->io_error;
		spa_t *spa = zio->io_spa;

		if (error) {
			if ((BP_GET_LEVEL(bp) != 0 || DMU_OT_IS_METADATA(type)) &&
			    type != DMU_OT_INTENT_LOG)
				atomic_inc_64(&sle->sle_meta_count);
			else
				atomic_inc_64(&sle->sle_data_count);
		}
		zio_data_buf_free(zio->io_data, zio->io_size);

		/* pairs with the throttle in spa_load_verify_cb() */
		mutex_enter(&spa->spa_scrub_lock);
		spa->spa_scrub_inflight--;
		cv_broadcast(&spa->spa_scrub_io_cv);
		mutex_exit(&spa->spa_scrub_lock);
	}

	/*
	* Maximum number of concurrent scrub i/os to create while verifying
	* a pool while importing it.
	*/
	int spa_load_verify_maxinflight = 10000;
	/*
	* When B_FALSE, spa_load_verify() skips the pool traversal and
	* spa_load_verify_cb() issues no reads.
	*/
	boolean_t spa_load_verify_metadata = B_TRUE;
	/*
	* When B_FALSE, spa_load_verify_cb() skips blocks whose buffer type is
	* ARC_BUFC_DATA.
	*/
	boolean_t spa_load_verify_data = B_TRUE;

	/ARGSUSED/
	static int
	spa_load_verify_cb(spa_t spa, zilog_t zilog, const blkptr_t *bp,
	const zbookmark_phys_t zb, const dnode_phys_t dnp, void *arg)
	{
	if (BP_IS_HOLE(bp) \|\| BP_IS_EMBEDDED(bp))
	return (0);
	/*
	* Note: normally this routine will not be called if
	* spa_load_verify_metadata is not set. However, it may be useful
	* to manually set the flag after the traversal has begun.
	*/
	if (!spa_load_verify_metadata)
	return (0);
	if (BP_GET_BUFC_TYPE(bp) == ARC_BUFC_DATA && !spa_load_verify_data)
	return (0);

	zio_t *rio = arg;
	size_t size = BP_GET_PSIZE(bp);
	void *data = zio_data_buf_alloc(size);

	mutex_enter(&spa->spa_scrub_lock);
	while (spa->spa_scrub_inflight >= spa_load_verify_maxinflight)
	cv_wait(&spa->spa_scrub_io_cv, &spa->spa_scrub_lock);
	spa->spa_scrub_inflight++;
	mutex_exit(&spa->spa_scrub_lock);

	zio_nowait(zio_read(rio, spa, bp, data, size,
	spa_load_verify_done, rio->io_private, ZIO_PRIORITY_SCRUB,
	ZIO_FLAG_SPECULATIVE \| ZIO_FLAG_CANFAIL \|
	ZIO_FLAG_SCRUB \| ZIO_FLAG_RAW, zb));
	return (0);
	}

	/*
	 * Verify a freshly loaded pool by traversing it from
	 * spa_verify_min_txg and reading the blocks found, then compare the
	 * resulting error counts with the limits of the pool's rewind policy.
	 *
	 * Returns 0 without verifying anything when the policy requests
	 * ZPOOL_NEVER_REWIND.  Otherwise returns 0 when the traversal
	 * succeeded and both error counts are within the policy limits; in
	 * that case the load txg/timestamp and rewind statistics are recorded
	 * in spa_load_info.  On failure spa_load_max_txg is set to the
	 * current uberblock txg and the traversal error (anything other than
	 * ENXIO/EIO is reported as EIO) or EIO is returned.
	 */
	static int
	spa_load_verify(spa_t *spa)
	{
		zio_t *rio;
		spa_load_error_t sle = { 0 };
		zpool_rewind_policy_t policy;
		boolean_t verify_ok = B_FALSE;
		int error = 0;

		zpool_get_rewind_policy(spa->spa_config, &policy);

		if (policy.zrp_request & ZPOOL_NEVER_REWIND)
			return (0);

		/* root zio: children report into `sle` via io_private */
		rio = zio_root(spa, NULL, &sle,
		    ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE);

		if (spa_load_verify_metadata) {
			error = traverse_pool(spa, spa->spa_verify_min_txg,
			    TRAVERSE_PRE | TRAVERSE_PREFETCH_METADATA,
			    spa_load_verify_cb, rio);
		}

		(void) zio_wait(rio);

		spa->spa_load_meta_errors = sle.sle_meta_count;
		spa->spa_load_data_errors = sle.sle_data_count;

		if (!error && sle.sle_meta_count <= policy.zrp_maxmeta &&
		    sle.sle_data_count <= policy.zrp_maxdata) {
			int64_t loss = 0;

			verify_ok = B_TRUE;
			spa->spa_load_txg = spa->spa_uberblock.ub_txg;
			spa->spa_load_txg_ts = spa->spa_uberblock.ub_timestamp;

			loss = spa->spa_last_ubsync_txg_ts - spa->spa_load_txg_ts;
			VERIFY(nvlist_add_uint64(spa->spa_load_info,
			    ZPOOL_CONFIG_LOAD_TIME, spa->spa_load_txg_ts) == 0);
			VERIFY(nvlist_add_int64(spa->spa_load_info,
			    ZPOOL_CONFIG_REWIND_TIME, loss) == 0);
			VERIFY(nvlist_add_uint64(spa->spa_load_info,
			    ZPOOL_CONFIG_LOAD_DATA_ERRORS, sle.sle_data_count) == 0);
		} else {
			spa->spa_load_max_txg = spa->spa_uberblock.ub_txg;
		}

		if (error) {
			if (error != ENXIO && error != EIO)
				error = SET_ERROR(EIO);
			return (error);
		}

		return (verify_ok ? 0 : EIO);
	}

	/*
	 * Find a value in the pool props object.
	 *
	 * Looks up the single uint64_t stored under the property's name in
	 * spa_pool_props_object and writes it to *val.  The lookup result is
	 * deliberately ignored, so *val is only meaningful if the caller
	 * pre-set it or knows the entry exists.
	 */
	static void
	spa_prop_find(spa_t *spa, zpool_prop_t prop, uint64_t *val)
	{
		(void) zap_lookup(spa->spa_meta_objset, spa->spa_pool_props_object,
		    zpool_prop_to_name(prop), sizeof (uint64_t), 1, val);
	}

	/*
	 * Find a value in the pool directory object.
	 *
	 * Looks up the single uint64_t stored under `name` in
	 * DMU_POOL_DIRECTORY_OBJECT, writes it to *val and returns the
	 * zap_lookup() result (0 on success).
	 */
	static int
	spa_dir_prop(spa_t *spa, const char *name, uint64_t *val)
	{
		return (zap_lookup(spa->spa_meta_objset, DMU_POOL_DIRECTORY_OBJECT,
		    name, sizeof (uint64_t), 1, val));
	}

	/*
	 * Convenience for error returns: put `vdev` into
	 * VDEV_STATE_CANT_OPEN with the given aux reason, then hand back
	 * `err` unchanged so the caller can `return (spa_vdev_err(...))`.
	 */
	static int
	spa_vdev_err(vdev_t *vdev, vdev_aux_t aux, int err)
	{
		int rc = err;

		vdev_set_state(vdev, B_TRUE, VDEV_STATE_CANT_OPEN, aux);

		return (rc);
	}

	/*
	 * Fix up config after a partly-completed split.  This is done with the
	 * ZPOOL_CONFIG_SPLIT nvlist.  Both the splitting pool and the split-off
	 * pool have that entry in their config, but only the splitting one
	 * contains a list of all the guids of the vdevs that are being split
	 * off.
	 *
	 * This function determines what to do with that list: either rejoin
	 * all the disks to the pool, or complete the splitting process.  To
	 * attempt the rejoin, each disk that is offlined is marked online
	 * again, and we do a reopen() call.  If the vdev label for every disk
	 * that was marked online indicates it was successfully split off
	 * (VDEV_AUX_SPLIT_POOL) then we call vdev_split() on each disk, and
	 * complete the split.
	 *
	 * Otherwise we leave the config alone, with all the vdevs in place in
	 * the original pool.
	 *
	 * Does nothing if `config` has no ZPOOL_CONFIG_SPLIT entry or that
	 * entry has no ZPOOL_CONFIG_SPLIT_LIST guid array.
	 */
	static void
	spa_try_repair(spa_t *spa, nvlist_t *config)
	{
		uint_t extracted = 0;
		uint64_t *glist;
		uint_t i, gcount;
		nvlist_t *nvl;
		vdev_t **vd;
		boolean_t attempt_reopen;

		if (nvlist_lookup_nvlist(config, ZPOOL_CONFIG_SPLIT, &nvl) != 0)
			return;

		/* check that the config is complete */
		if (nvlist_lookup_uint64_array(nvl, ZPOOL_CONFIG_SPLIT_LIST,
		    &glist, &gcount) != 0)
			return;

		/* zero-filled, so hole slots and failed lookups stay NULL */
		vd = kmem_zalloc(gcount * sizeof (vdev_t *), KM_SLEEP);

		/* attempt to online all the vdevs & validate */
		attempt_reopen = B_TRUE;
		for (i = 0; i < gcount; i++) {
			if (glist[i] == 0)	/* vdev is hole */
				continue;

			vd[i] = spa_lookup_by_guid(spa, glist[i], B_FALSE);
			if (vd[i] == NULL) {
				/*
				 * Don't bother attempting to reopen the disks;
				 * just do the split.
				 */
				attempt_reopen = B_FALSE;
			} else {
				/* attempt to re-online it */
				vd[i]->vdev_offline = B_FALSE;
			}
		}

		if (attempt_reopen) {
			vdev_reopen(spa->spa_root_vdev);

			/* check each device to see what state it's in */
			for (extracted = 0, i = 0; i < gcount; i++) {
				if (vd[i] != NULL &&
				    vd[i]->vdev_stat.vs_aux != VDEV_AUX_SPLIT_POOL)
					break;
				++extracted;
			}
		}

		/*
		 * If every disk has been moved to the new pool, or if we never
		 * even attempted to look at them, then we split them off for
		 * good.
		 */
		if (!attempt_reopen || gcount == extracted) {
			for (i = 0; i < gcount; i++)
				if (vd[i] != NULL)
					vdev_split(vd[i]);
			vdev_reopen(spa->spa_root_vdev);
		}

		kmem_free(vd, gcount * sizeof (vdev_t *));
	}

	/*
	 * Load a pool from spa->spa_config.
	 *
	 * Seeds the spa_t from the config (pool guid, comment, version, config
	 * txg), rejects an import/tryimport of a guid that already exists with
	 * EEXIST, and otherwise resets spa_load_info and calls
	 * spa_load_impl().  Afterwards it records spa_minref, posts an ereport
	 * for any failure other than EBADF, and sets spa_load_state to
	 * SPA_LOAD_ERROR or SPA_LOAD_NONE.
	 *
	 * Returns 0 on success, EINVAL if the config carries no pool guid, or
	 * the error from the load.
	 */
	static int
	spa_load(spa_t *spa, spa_load_state_t state, spa_import_type_t type,
	    boolean_t mosconfig)
	{
		nvlist_t *config = spa->spa_config;
		char *ereport = FM_EREPORT_ZFS_POOL;
		char *comment;
		int error;
		uint64_t pool_guid;
		nvlist_t *nvl;

		if (nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_GUID, &pool_guid))
			return (SET_ERROR(EINVAL));

		ASSERT(spa->spa_comment == NULL);
		if (nvlist_lookup_string(config, ZPOOL_CONFIG_COMMENT, &comment) == 0)
			spa->spa_comment = spa_strdup(comment);

		/*
		 * Versioning wasn't explicitly added to the label until later,
		 * so if it's not present treat it as the initial version.
		 */
		if (nvlist_lookup_uint64(config, ZPOOL_CONFIG_VERSION,
		    &spa->spa_ubsync.ub_version) != 0)
			spa->spa_ubsync.ub_version = SPA_VERSION_INITIAL;

		(void) nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_TXG,
		    &spa->spa_config_txg);

		if ((state == SPA_LOAD_IMPORT || state == SPA_LOAD_TRYIMPORT) &&
		    spa_guid_exists(pool_guid, 0)) {
			error = SET_ERROR(EEXIST);
		} else {
			spa->spa_config_guid = pool_guid;

			if (nvlist_lookup_nvlist(config, ZPOOL_CONFIG_SPLIT,
			    &nvl) == 0) {
				VERIFY(nvlist_dup(nvl, &spa->spa_config_splitting,
				    KM_SLEEP) == 0);
			}

			nvlist_free(spa->spa_load_info);
			spa->spa_load_info = fnvlist_alloc();

			gethrestime(&spa->spa_loaded_ts);
			error = spa_load_impl(spa, pool_guid, config, state, type,
			    mosconfig, &ereport);
		}

		/*
		 * Don't count references from objsets that are already closed
		 * and are making their way through the eviction process.
		 */
		spa_evicting_os_wait(spa);
		spa->spa_minref = refcount_count(&spa->spa_refcount);
		if (error) {
			if (error != EEXIST) {
				spa->spa_loaded_ts.tv_sec = 0;
				spa->spa_loaded_ts.tv_nsec = 0;
			}
			if (error != EBADF) {
				zfs_ereport_post(ereport, spa, NULL, NULL, 0, 0);
			}
		}
		spa->spa_load_state = error ? SPA_LOAD_ERROR : SPA_LOAD_NONE;
		spa->spa_ena = 0;

		return (error);
	}

	/*
	 * Load an existing storage pool, using the pool's builtin spa_config as a
	 * source of configuration information.
	 *
	 * The caller must hold spa_namespace_lock.  `config` must contain a
	 * ZPOOL_CONFIG_VDEV_TREE entry, otherwise EINVAL is returned.
	 *
	 * When `mosconfig` is B_FALSE the config is untrusted: the pool is
	 * accessed read-only, and once the DSL pool is open the config stored in
	 * the MOS is loaded and installed, the pool is unloaded and re-activated
	 * in its original mode, and spa_load() is called again with mosconfig set
	 * to B_TRUE.
	 *
	 * `*ereport` is changed to FM_EREPORT_ZFS_LOG_REPLAY when the load fails
	 * in spa_check_logs().
	 *
	 * Returns 0 on success or an errno value on failure.
	 */
	static int
	spa_load_impl(spa_t *spa, uint64_t pool_guid, nvlist_t *config,
	    spa_load_state_t state, spa_import_type_t type, boolean_t mosconfig,
	    char **ereport)
	{
		int error = 0;
		nvlist_t *nvroot = NULL;
		nvlist_t *label;
		vdev_t *rvd;
		uberblock_t *ub = &spa->spa_uberblock;
		uint64_t children, config_cache_txg = spa->spa_config_txg;
		int orig_mode = spa->spa_mode;
		int parse;
		uint64_t obj;
		boolean_t missing_feat_write = B_FALSE;

		/*
		 * If this is an untrusted config, access the pool in read-only mode.
		 * This prevents things like resilvering recently removed devices.
		 */
		if (!mosconfig)
			spa->spa_mode = FREAD;

		ASSERT(MUTEX_HELD(&spa_namespace_lock));

		spa->spa_load_state = state;

		if (nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, &nvroot))
			return (SET_ERROR(EINVAL));

		parse = (type == SPA_IMPORT_EXISTING ?
		    VDEV_ALLOC_LOAD : VDEV_ALLOC_SPLIT);

		/*
		 * Create "The Godfather" zio to hold all async IOs
		 */
		spa->spa_async_zio_root = kmem_alloc(max_ncpus * sizeof (void *),
		    KM_SLEEP);
		for (int i = 0; i < max_ncpus; i++) {
			spa->spa_async_zio_root[i] = zio_root(spa, NULL, NULL,
			    ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE |
			    ZIO_FLAG_GODFATHER);
		}

		/*
		 * Parse the configuration into a vdev tree.  We explicitly set the
		 * value that will be returned by spa_version() since parsing the
		 * configuration requires knowing the version number.
		 */
		spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
		error = spa_config_parse(spa, &rvd, nvroot, NULL, 0, parse);
		spa_config_exit(spa, SCL_ALL, FTAG);

		if (error != 0)
			return (error);

		ASSERT(spa->spa_root_vdev == rvd);
		ASSERT3U(spa->spa_min_ashift, >=, SPA_MINBLOCKSHIFT);
		ASSERT3U(spa->spa_max_ashift, <=, SPA_MAXBLOCKSHIFT);

		if (type != SPA_IMPORT_ASSEMBLE) {
			ASSERT(spa_guid(spa) == pool_guid);
		}

		/*
		 * Try to open all vdevs, loading each label in the process.
		 */
		spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
		error = vdev_open(rvd);
		spa_config_exit(spa, SCL_ALL, FTAG);
		if (error != 0)
			return (error);

		/*
		 * We need to validate the vdev labels against the configuration that
		 * we have in hand, which is dependent on the setting of mosconfig. If
		 * mosconfig is true then we're validating the vdev labels based on
		 * that config.  Otherwise, we're validating against the cached config
		 * (zpool.cache) that was read when we loaded the zfs module, and then
		 * later we will recursively call spa_load() and validate against
		 * the vdev config.
		 *
		 * If we're assembling a new pool that's been split off from an
		 * existing pool, the labels haven't yet been updated so we skip
		 * validation for now.
		 */
		if (type != SPA_IMPORT_ASSEMBLE) {
			spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
			error = vdev_validate(rvd, mosconfig);
			spa_config_exit(spa, SCL_ALL, FTAG);

			if (error != 0)
				return (error);

			if (rvd->vdev_state <= VDEV_STATE_CANT_OPEN)
				return (SET_ERROR(ENXIO));
		}

		/*
		 * Find the best uberblock.
		 */
		vdev_uberblock_load(rvd, ub, &label);

		/*
		 * If we weren't able to find a single valid uberblock, return failure.
		 */
		if (ub->ub_txg == 0) {
			nvlist_free(label);
			return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, ENXIO));
		}

		/*
		 * If the pool has an unsupported version we can't open it.
		 */
		if (!SPA_VERSION_IS_SUPPORTED(ub->ub_version)) {
			nvlist_free(label);
			return (spa_vdev_err(rvd, VDEV_AUX_VERSION_NEWER, ENOTSUP));
		}

		if (ub->ub_version >= SPA_VERSION_FEATURES) {
			nvlist_t *features;

			/*
			 * If we weren't able to find what's necessary for reading the
			 * MOS in the label, return failure.
			 */
			if (label == NULL || nvlist_lookup_nvlist(label,
			    ZPOOL_CONFIG_FEATURES_FOR_READ, &features) != 0) {
				nvlist_free(label);
				return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA,
				    ENXIO));
			}

			/*
			 * Update our in-core representation with the definitive values
			 * from the label.
			 */
			nvlist_free(spa->spa_label_features);
			VERIFY(nvlist_dup(features, &spa->spa_label_features, 0) == 0);
		}

		nvlist_free(label);

		/*
		 * Look through entries in the label nvlist's features_for_read. If
		 * there is a feature listed there which we don't understand then we
		 * cannot open a pool.
		 */
		if (ub->ub_version >= SPA_VERSION_FEATURES) {
			nvlist_t *unsup_feat;

			VERIFY(nvlist_alloc(&unsup_feat, NV_UNIQUE_NAME, KM_SLEEP) ==
			    0);

			for (nvpair_t *nvp = nvlist_next_nvpair(spa->spa_label_features,
			    NULL); nvp != NULL;
			    nvp = nvlist_next_nvpair(spa->spa_label_features, nvp)) {
				if (!zfeature_is_supported(nvpair_name(nvp))) {
					VERIFY(nvlist_add_string(unsup_feat,
					    nvpair_name(nvp), "") == 0);
				}
			}

			if (!nvlist_empty(unsup_feat)) {
				VERIFY(nvlist_add_nvlist(spa->spa_load_info,
				    ZPOOL_CONFIG_UNSUP_FEAT, unsup_feat) == 0);
				nvlist_free(unsup_feat);
				return (spa_vdev_err(rvd, VDEV_AUX_UNSUP_FEAT,
				    ENOTSUP));
			}

			nvlist_free(unsup_feat);
		}

		/*
		 * If the vdev guid sum doesn't match the uberblock, we have an
		 * incomplete configuration.  We first check to see if the pool
		 * is aware of the complete config (i.e ZPOOL_CONFIG_VDEV_CHILDREN).
		 * If it is, defer the vdev_guid_sum check till later so we
		 * can handle missing vdevs.
		 */
		if (nvlist_lookup_uint64(config, ZPOOL_CONFIG_VDEV_CHILDREN,
		    &children) != 0 && mosconfig && type != SPA_IMPORT_ASSEMBLE &&
		    rvd->vdev_guid_sum != ub->ub_guid_sum)
			return (spa_vdev_err(rvd, VDEV_AUX_BAD_GUID_SUM, ENXIO));

		if (type != SPA_IMPORT_ASSEMBLE && spa->spa_config_splitting) {
			spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
			spa_try_repair(spa, config);
			spa_config_exit(spa, SCL_ALL, FTAG);
			nvlist_free(spa->spa_config_splitting);
			spa->spa_config_splitting = NULL;
		}

		/*
		 * Initialize internal SPA structures.
		 */
		spa->spa_state = POOL_STATE_ACTIVE;
		spa->spa_ubsync = spa->spa_uberblock;
		spa->spa_verify_min_txg = spa->spa_extreme_rewind ?
		    TXG_INITIAL - 1 : spa_last_synced_txg(spa) - TXG_DEFER_SIZE - 1;
		spa->spa_first_txg = spa->spa_last_ubsync_txg ?
		    spa->spa_last_ubsync_txg : spa_last_synced_txg(spa) + 1;
		spa->spa_claim_max_txg = spa->spa_first_txg;
		spa->spa_prev_software_version = ub->ub_software_version;

		error = dsl_pool_init(spa, spa->spa_first_txg, &spa->spa_dsl_pool);
		if (error)
			return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO));
		spa->spa_meta_objset = spa->spa_dsl_pool->dp_meta_objset;

		if (spa_dir_prop(spa, DMU_POOL_CONFIG, &spa->spa_config_object) != 0)
			return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO));

		if (spa_version(spa) >= SPA_VERSION_FEATURES) {
			boolean_t missing_feat_read = B_FALSE;
			nvlist_t *unsup_feat, *enabled_feat;

			if (spa_dir_prop(spa, DMU_POOL_FEATURES_FOR_READ,
			    &spa->spa_feat_for_read_obj) != 0) {
				return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO));
			}

			if (spa_dir_prop(spa, DMU_POOL_FEATURES_FOR_WRITE,
			    &spa->spa_feat_for_write_obj) != 0) {
				return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO));
			}

			if (spa_dir_prop(spa, DMU_POOL_FEATURE_DESCRIPTIONS,
			    &spa->spa_feat_desc_obj) != 0) {
				return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO));
			}

			enabled_feat = fnvlist_alloc();
			unsup_feat = fnvlist_alloc();

			if (!spa_features_check(spa, B_FALSE,
			    unsup_feat, enabled_feat))
				missing_feat_read = B_TRUE;

			if (spa_writeable(spa) || state == SPA_LOAD_TRYIMPORT) {
				if (!spa_features_check(spa, B_TRUE,
				    unsup_feat, enabled_feat)) {
					missing_feat_write = B_TRUE;
				}
			}

			fnvlist_add_nvlist(spa->spa_load_info,
			    ZPOOL_CONFIG_ENABLED_FEAT, enabled_feat);

			if (!nvlist_empty(unsup_feat)) {
				fnvlist_add_nvlist(spa->spa_load_info,
				    ZPOOL_CONFIG_UNSUP_FEAT, unsup_feat);
			}

			fnvlist_free(enabled_feat);
			fnvlist_free(unsup_feat);

			if (!missing_feat_read) {
				fnvlist_add_boolean(spa->spa_load_info,
				    ZPOOL_CONFIG_CAN_RDONLY);
			}

			/*
			 * If the state is SPA_LOAD_TRYIMPORT, our objective is
			 * twofold: to determine whether the pool is available for
			 * import in read-write mode and (if it is not) whether the
			 * pool is available for import in read-only mode. If the pool
			 * is available for import in read-write mode, it is displayed
			 * as available in userland; if it is not available for import
			 * in read-only mode, it is displayed as unavailable in
			 * userland. If the pool is available for import in read-only
			 * mode but not read-write mode, it is displayed as unavailable
			 * in userland with a special note that the pool is actually
			 * available for open in read-only mode.
			 *
			 * As a result, if the state is SPA_LOAD_TRYIMPORT and we are
			 * missing a feature for write, we must first determine whether
			 * the pool can be opened read-only before returning to
			 * userland in order to know whether to display the
			 * abovementioned note.
			 */
			if (missing_feat_read || (missing_feat_write &&
			    spa_writeable(spa))) {
				return (spa_vdev_err(rvd, VDEV_AUX_UNSUP_FEAT,
				    ENOTSUP));
			}

			/*
			 * Load refcounts for ZFS features from disk into an in-memory
			 * cache during SPA initialization.
			 */
			for (spa_feature_t i = 0; i < SPA_FEATURES; i++) {
				uint64_t refcount;

				error = feature_get_refcount_from_disk(spa,
				    &spa_feature_table[i], &refcount);
				if (error == 0) {
					spa->spa_feat_refcount_cache[i] = refcount;
				} else if (error == ENOTSUP) {
					spa->spa_feat_refcount_cache[i] =
					    SPA_FEATURE_DISABLED;
				} else {
					return (spa_vdev_err(rvd,
					    VDEV_AUX_CORRUPT_DATA, EIO));
				}
			}
		}

		if (spa_feature_is_active(spa, SPA_FEATURE_ENABLED_TXG)) {
			if (spa_dir_prop(spa, DMU_POOL_FEATURE_ENABLED_TXG,
			    &spa->spa_feat_enabled_txg_obj) != 0)
				return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO));
		}

		spa->spa_is_initializing = B_TRUE;
		error = dsl_pool_open(spa->spa_dsl_pool);
		spa->spa_is_initializing = B_FALSE;
		if (error != 0)
			return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO));

		if (!mosconfig) {
			uint64_t hostid;
			nvlist_t *policy = NULL, *nvconfig;

			if (load_nvlist(spa, spa->spa_config_object, &nvconfig) != 0)
				return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO));

			if (!spa_is_root(spa) && nvlist_lookup_uint64(nvconfig,
			    ZPOOL_CONFIG_HOSTID, &hostid) == 0) {
				char *hostname;
				unsigned long myhostid = 0;

				VERIFY(nvlist_lookup_string(nvconfig,
				    ZPOOL_CONFIG_HOSTNAME, &hostname) == 0);

	#ifdef _KERNEL
				myhostid = zone_get_hostid(NULL);
	#else /* _KERNEL */
				/*
				 * We're emulating the system's hostid in userland, so
				 * we can't use zone_get_hostid().
				 */
				(void) ddi_strtoul(hw_serial, NULL, 10, &myhostid);
	#endif /* _KERNEL */
				if (hostid != 0 && myhostid != 0 &&
				    hostid != myhostid) {
					nvlist_free(nvconfig);
					cmn_err(CE_WARN, "pool '%s' could not be "
					    "loaded as it was last accessed by "
					    "another system (host: %s hostid: 0x%lx). "
					    "See: http://illumos.org/msg/ZFS-8000-EY",
					    spa_name(spa), hostname,
					    (unsigned long)hostid);
					return (SET_ERROR(EBADF));
				}
			}
			if (nvlist_lookup_nvlist(spa->spa_config,
			    ZPOOL_REWIND_POLICY, &policy) == 0)
				VERIFY(nvlist_add_nvlist(nvconfig,
				    ZPOOL_REWIND_POLICY, policy) == 0);

			/* restart the load from the trusted MOS config */
			spa_config_set(spa, nvconfig);
			spa_unload(spa);
			spa_deactivate(spa);
			spa_activate(spa, orig_mode);

			return (spa_load(spa, state, SPA_IMPORT_EXISTING, B_TRUE));
		}

		if (spa_dir_prop(spa, DMU_POOL_SYNC_BPOBJ, &obj) != 0)
			return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO));
		error = bpobj_open(&spa->spa_deferred_bpobj, spa->spa_meta_objset, obj);
		if (error != 0)
			return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO));

		/*
		 * Load the bit that tells us to use the new accounting function
		 * (raid-z deflation).  If we have an older pool, this will not
		 * be present.
		 */
		error = spa_dir_prop(spa, DMU_POOL_DEFLATE, &spa->spa_deflate);
		if (error != 0 && error != ENOENT)
			return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO));

		error = spa_dir_prop(spa, DMU_POOL_CREATION_VERSION,
		    &spa->spa_creation_version);
		if (error != 0 && error != ENOENT)
			return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO));

		/*
		 * Load the persistent error log.  If we have an older pool, this will
		 * not be present.
		 */
		error = spa_dir_prop(spa, DMU_POOL_ERRLOG_LAST, &spa->spa_errlog_last);
		if (error != 0 && error != ENOENT)
			return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO));

		error = spa_dir_prop(spa, DMU_POOL_ERRLOG_SCRUB,
		    &spa->spa_errlog_scrub);
		if (error != 0 && error != ENOENT)
			return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO));

		/*
		 * Load the history object.  If we have an older pool, this
		 * will not be present.
		 */
		error = spa_dir_prop(spa, DMU_POOL_HISTORY, &spa->spa_history);
		if (error != 0 && error != ENOENT)
			return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO));

		/*
		 * If we're assembling the pool from the split-off vdevs of
		 * an existing pool, we don't want to attach the spares & cache
		 * devices.
		 */

		/*
		 * Load any hot spares for this pool.
		 */
		error = spa_dir_prop(spa, DMU_POOL_SPARES, &spa->spa_spares.sav_object);
		if (error != 0 && error != ENOENT)
			return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO));
		if (error == 0 && type != SPA_IMPORT_ASSEMBLE) {
			ASSERT(spa_version(spa) >= SPA_VERSION_SPARES);
			if (load_nvlist(spa, spa->spa_spares.sav_object,
			    &spa->spa_spares.sav_config) != 0)
				return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO));

			spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
			spa_load_spares(spa);
			spa_config_exit(spa, SCL_ALL, FTAG);
		} else if (error == 0) {
			spa->spa_spares.sav_sync = B_TRUE;
		}

		/*
		 * Load any level 2 ARC devices for this pool.
		 */
		error = spa_dir_prop(spa, DMU_POOL_L2CACHE,
		    &spa->spa_l2cache.sav_object);
		if (error != 0 && error != ENOENT)
			return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO));
		if (error == 0 && type != SPA_IMPORT_ASSEMBLE) {
			ASSERT(spa_version(spa) >= SPA_VERSION_L2CACHE);
			if (load_nvlist(spa, spa->spa_l2cache.sav_object,
			    &spa->spa_l2cache.sav_config) != 0)
				return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO));

			spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
			spa_load_l2cache(spa);
			spa_config_exit(spa, SCL_ALL, FTAG);
		} else if (error == 0) {
			spa->spa_l2cache.sav_sync = B_TRUE;
		}

		spa->spa_delegation = zpool_prop_default_numeric(ZPOOL_PROP_DELEGATION);

		error = spa_dir_prop(spa, DMU_POOL_PROPS, &spa->spa_pool_props_object);
		if (error && error != ENOENT)
			return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO));

		if (error == 0) {
			uint64_t autoreplace;

			spa_prop_find(spa, ZPOOL_PROP_BOOTFS, &spa->spa_bootfs);
			spa_prop_find(spa, ZPOOL_PROP_AUTOREPLACE, &autoreplace);
			spa_prop_find(spa, ZPOOL_PROP_DELEGATION, &spa->spa_delegation);
			spa_prop_find(spa, ZPOOL_PROP_FAILUREMODE, &spa->spa_failmode);
			spa_prop_find(spa, ZPOOL_PROP_AUTOEXPAND, &spa->spa_autoexpand);
			spa_prop_find(spa, ZPOOL_PROP_DEDUPDITTO,
			    &spa->spa_dedup_ditto);

			spa->spa_autoreplace = (autoreplace != 0);
		}

		/*
		 * If the 'autoreplace' property is set, then post a resource notifying
		 * the ZFS DE that it should not issue any faults for unopenable
		 * devices.  We also iterate over the vdevs, and post a sysevent for any
		 * unopenable vdevs so that the normal autoreplace handler can take
		 * over.
		 */
		if (spa->spa_autoreplace && state != SPA_LOAD_TRYIMPORT) {
			spa_check_removed(spa->spa_root_vdev);
			/*
			 * For the import case, this is done in spa_import(), because
			 * at this point we're using the spare definitions from
			 * the MOS config, not necessarily from the userland config.
			 */
			if (state != SPA_LOAD_IMPORT) {
				spa_aux_check_removed(&spa->spa_spares);
				spa_aux_check_removed(&spa->spa_l2cache);
			}
		}

		/*
		 * Load the vdev state for all toplevel vdevs.
		 */
		vdev_load(rvd);

		/*
		 * Propagate the leaf DTLs we just loaded all the way up the tree.
		 */
		spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
		vdev_dtl_reassess(rvd, 0, 0, B_FALSE);
		spa_config_exit(spa, SCL_ALL, FTAG);

		/*
		 * Load the DDTs (dedup tables).
		 */
		error = ddt_load(spa);
		if (error != 0)
			return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO));

		spa_update_dspace(spa);

		/*
		 * Validate the config, using the MOS config to fill in any
		 * information which might be missing.  If we fail to validate
		 * the config then declare the pool unfit for use. If we're
		 * assembling a pool from a split, the log is not transferred
		 * over.
		 */
		if (type != SPA_IMPORT_ASSEMBLE) {
			nvlist_t *nvconfig;

			if (load_nvlist(spa, spa->spa_config_object, &nvconfig) != 0)
				return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO));

			if (!spa_config_valid(spa, nvconfig)) {
				nvlist_free(nvconfig);
				return (spa_vdev_err(rvd, VDEV_AUX_BAD_GUID_SUM,
				    ENXIO));
			}
			nvlist_free(nvconfig);

			/*
			 * Now that we've validated the config, check the state of the
			 * root vdev.  If it can't be opened, it indicates one or
			 * more toplevel vdevs are faulted.
			 */
			if (rvd->vdev_state <= VDEV_STATE_CANT_OPEN)
				return (SET_ERROR(ENXIO));

			if (spa_check_logs(spa)) {
				*ereport = FM_EREPORT_ZFS_LOG_REPLAY;
				return (spa_vdev_err(rvd, VDEV_AUX_BAD_LOG, ENXIO));
			}
		}

		if (missing_feat_write) {
			ASSERT(state == SPA_LOAD_TRYIMPORT);

			/*
			 * At this point, we know that we can open the pool in
			 * read-only mode but not read-write mode. We now have enough
			 * information and can return to userland.
			 */
			return (spa_vdev_err(rvd, VDEV_AUX_UNSUP_FEAT, ENOTSUP));
		}

		/*
		 * We've successfully opened the pool, verify that we're ready
		 * to start pushing transactions.
		 */
		if (state != SPA_LOAD_TRYIMPORT) {
			error = spa_load_verify(spa);
			if (error != 0)
				return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA,
				    error));
		}

		if (spa_writeable(spa) && (state == SPA_LOAD_RECOVER ||
		    spa->spa_load_max_txg == UINT64_MAX)) {
			dmu_tx_t *tx;
			int need_update = B_FALSE;
	+		dsl_pool_t *dp = spa_get_dsl(spa);

			ASSERT(state != SPA_LOAD_TRYIMPORT);

			/*
			 * Claim log blocks that haven't been committed yet.
			 * This must all happen in a single txg.
			 * Note: spa_claim_max_txg is updated by spa_claim_notify(),
			 * invoked from zil_claim_log_block()'s i/o done callback.
			 * Price of rollback is that we abandon the log.
			 */
			spa->spa_claiming = B_TRUE;

	-		tx = dmu_tx_create_assigned(spa_get_dsl(spa),
	-		    spa_first_txg(spa));
	-		(void) dmu_objset_find(spa_name(spa),
	+		tx = dmu_tx_create_assigned(dp, spa_first_txg(spa));
	+		(void) dmu_objset_find_dp(dp, dp->dp_root_dir_obj,
			    zil_claim, tx, DS_FIND_CHILDREN);
			dmu_tx_commit(tx);

			spa->spa_claiming = B_FALSE;

			spa_set_log_state(spa, SPA_LOG_GOOD);
			spa->spa_sync_on = B_TRUE;
			txg_sync_start(spa->spa_dsl_pool);

			/*
			 * Wait for all claims to sync.  We sync up to the highest
			 * claimed log block birth time so that claimed log blocks
			 * don't appear to be from the future.  spa_claim_max_txg
			 * will have been set for us by either zil_check_log_chain()
			 * (invoked from spa_check_logs()) or zil_claim() above.
			 */
			txg_wait_synced(spa->spa_dsl_pool, spa->spa_claim_max_txg);

			/*
			 * If the config cache is stale, or we have uninitialized
			 * metaslabs (see spa_vdev_add()), then update the config.
			 *
			 * If this is a verbatim import, trust the current
			 * in-core spa_config and update the disk labels.
			 */
			if (config_cache_txg != spa->spa_config_txg ||
			    state == SPA_LOAD_IMPORT ||
			    state == SPA_LOAD_RECOVER ||
			    (spa->spa_import_flags & ZFS_IMPORT_VERBATIM))
				need_update = B_TRUE;

			for (int c = 0; c < rvd->vdev_children; c++)
				if (rvd->vdev_child[c]->vdev_ms_array == 0)
					need_update = B_TRUE;

			/*
			 * Update the config cache asynchronously in case we're the
			 * root pool, in which case the config cache isn't writable yet.
			 */
			if (need_update)
				spa_async_request(spa, SPA_ASYNC_CONFIG_UPDATE);

			/*
			 * Check all DTLs to see if anything needs resilvering.
			 */
			if (!dsl_scan_resilvering(spa->spa_dsl_pool) &&
			    vdev_resilver_needed(rvd, NULL, NULL))
				spa_async_request(spa, SPA_ASYNC_RESILVER);

			/*
			 * Log the fact that we booted up (so that we can detect if
			 * we rebooted in the middle of an operation).
			 */
			spa_history_log_version(spa, "open");

			/*
			 * Delete any inconsistent datasets.
			 */
			(void) dmu_objset_find(spa_name(spa),
			    dsl_destroy_inconsistent, NULL, DS_FIND_CHILDREN);

			/*
			 * Clean up any stale temporary dataset userrefs.
			 */
			dsl_pool_clean_tmp_userrefs(spa->spa_dsl_pool);
		}

		return (0);
	}

	/*
	 * Tear the pool down and try loading it again from one txg earlier:
	 * spa_load_max_txg is set to the current uberblock's txg minus one
	 * before the pool is re-activated in its previous mode.
	 *
	 * Returns the result of spa_load().
	 */
	static int
	spa_load_retry(spa_t *spa, spa_load_state_t state, int mosconfig)
	{
		int saved_mode = spa->spa_mode;
		int err;

		spa_unload(spa);
		spa_deactivate(spa);

		/* cap the next attempt just below the txg that failed */
		spa->spa_load_max_txg = spa->spa_uberblock.ub_txg - 1;

		spa_activate(spa, saved_mode);
		spa_async_suspend(spa);

		err = spa_load(spa, state, SPA_IMPORT_EXISTING, mosconfig);

		return (err);
	}

	/*
	 * If spa_load() fails this function will try loading prior txg's. If
	 * 'state' is SPA_LOAD_RECOVER and one of these loads succeeds the pool
	 * will be rewound to that txg. If 'state' is not SPA_LOAD_RECOVER this
	 * function will not rewind the pool and will return the same error as
	 * spa_load().
	 *
	 * 'max_request' caps the txg of the first load attempt (UINT64_MAX
	 * means no cap); 'rewind_flags' carries the ZPOOL_*_REWIND policy
	 * bits.  With ZPOOL_NEVER_REWIND the first load error is returned
	 * without any retry.
	 */
	static int
	spa_load_best(spa_t *spa, spa_load_state_t state, int mosconfig,
	    uint64_t max_request, int rewind_flags)
	{
		nvlist_t *loadinfo = NULL;
		nvlist_t *config = NULL;
		int load_error, rewind_error;
		uint64_t safe_rewind_txg;
		uint64_t min_txg;

		if (spa->spa_load_txg && state == SPA_LOAD_RECOVER) {
			spa->spa_load_max_txg = spa->spa_load_txg;
			spa_set_log_state(spa, SPA_LOG_CLEAR);
		} else {
			spa->spa_load_max_txg = max_request;
			if (max_request != UINT64_MAX)
				spa->spa_extreme_rewind = B_TRUE;
		}

		load_error = rewind_error = spa_load(spa, state, SPA_IMPORT_EXISTING,
		    mosconfig);
		if (load_error == 0)
			return (0);

		/* snapshot the config of the failed load before retrying */
		if (spa->spa_root_vdev != NULL)
			config = spa_config_generate(spa, NULL, -1ULL, B_TRUE);

		spa->spa_last_ubsync_txg = spa->spa_uberblock.ub_txg;
		spa->spa_last_ubsync_txg_ts = spa->spa_uberblock.ub_timestamp;

		if (rewind_flags & ZPOOL_NEVER_REWIND) {
			nvlist_free(config);
			return (load_error);
		}

		if (state == SPA_LOAD_RECOVER) {
			/* Price of rolling back is discarding txgs, including log */
			spa_set_log_state(spa, SPA_LOG_CLEAR);
		} else {
			/*
			 * If we aren't rolling back save the load info from our first
			 * import attempt so that we can restore it after attempting
			 * to rewind.
			 */
			loadinfo = spa->spa_load_info;
			spa->spa_load_info = fnvlist_alloc();
		}

		spa->spa_load_max_txg = spa->spa_last_ubsync_txg;
		safe_rewind_txg = spa->spa_last_ubsync_txg - TXG_DEFER_SIZE;
		min_txg = (rewind_flags & ZPOOL_EXTREME_REWIND) ?
		    TXG_INITIAL : safe_rewind_txg;

		/*
		 * Continue as long as we're finding errors, we're still within
		 * the acceptable rewind range, and we're still finding uberblocks
		 */
		while (rewind_error && spa->spa_uberblock.ub_txg >= min_txg &&
		    spa->spa_uberblock.ub_txg <= spa->spa_load_max_txg) {
			if (spa->spa_load_max_txg < safe_rewind_txg)
				spa->spa_extreme_rewind = B_TRUE;
			rewind_error = spa_load_retry(spa, state, mosconfig);
		}

		spa->spa_extreme_rewind = B_FALSE;
		spa->spa_load_max_txg = UINT64_MAX;

		/*
		 * Either hand the saved config over to the spa or release it;
		 * without the free it would leak after a successful
		 * SPA_LOAD_RECOVER rewind.
		 */
		if (config && (rewind_error || state != SPA_LOAD_RECOVER))
			spa_config_set(spa, config);
		else
			nvlist_free(config);

		if (state == SPA_LOAD_RECOVER) {
			ASSERT3P(loadinfo, ==, NULL);
			return (rewind_error);
		} else {
			/* Store the rewind info as part of the initial load info */
			fnvlist_add_nvlist(loadinfo, ZPOOL_CONFIG_REWIND_INFO,
			    spa->spa_load_info);

			/* Restore the initial load info */
			fnvlist_free(spa->spa_load_info);
			spa->spa_load_info = loadinfo;

			return (load_error);
		}
	}

	/*
	* Pool Open/Import
	*
	* The import case is identical to an open except that the configuration is sent
	* down from userland, instead of grabbed from the configuration cache. For the
	* case of an open, the pool configuration will exist in the
	* POOL_STATE_UNINITIALIZED state.
	*
	* The stats information (gen/count/ustats) is used to gather vdev statistics at
	* the same time as the pool is opened, without having to keep the spa_t around
	* in some ambiguous state.
	*/
	static int
	spa_open_common(const char pool, spa_t spapp, void tag, nvlist_t *nvpolicy,
	nvlist_t **config)
	{
	spa_t *spa;
	spa_load_state_t state = SPA_LOAD_OPEN;
	int error;
	int locked = B_FALSE;

	*spapp = NULL;

	/*
	* As disgusting as this is, we need to support recursive calls to this
	* function because dsl_dir_open() is called during spa_load(), and ends
	* up calling spa_open() again. The real fix is to figure out how to
	* avoid dsl_dir_open() calling this in the first place.
	*/
	if (mutex_owner(&spa_namespace_lock) != curthread) {
	mutex_enter(&spa_namespace_lock);
	locked = B_TRUE;
	}

	if ((spa = spa_lookup(pool)) == NULL) {
	if (locked)
	mutex_exit(&spa_namespace_lock);
	return (SET_ERROR(ENOENT));
	}

	if (spa->spa_state == POOL_STATE_UNINITIALIZED) {
	zpool_rewind_policy_t policy;

	zpool_get_rewind_policy(nvpolicy ? nvpolicy : spa->spa_config,
	&policy);
	if (policy.zrp_request & ZPOOL_DO_REWIND)
	state = SPA_LOAD_RECOVER;

	spa_activate(spa, spa_mode_global);

	if (state != SPA_LOAD_RECOVER)
	spa->spa_last_ubsync_txg = spa->spa_load_txg = 0;

	error = spa_load_best(spa, state, B_FALSE, policy.zrp_txg,
	policy.zrp_request);

	if (error == EBADF) {
	/*
	* If vdev_validate() returns failure (indicated by
	* EBADF), it indicates that one of the vdevs indicates
	* that the pool has been exported or destroyed. If
	* this is the case, the config cache is out of sync and
	* we should remove the pool from the namespace.
	*/
	spa_unload(spa);
	spa_deactivate(spa);
	spa_config_sync(spa, B_TRUE, B_TRUE);
	spa_remove(spa);
	if (locked)
	mutex_exit(&spa_namespace_lock);
	return (SET_ERROR(ENOENT));
	}

	if (error) {
	/*
	* We can't open the pool, but we still have useful
	* information: the state of each vdev after the
	* attempted vdev_open(). Return this to the user.
	*/
	if (config != NULL && spa->spa_config) {
	VERIFY(nvlist_dup(spa->spa_config, config,
	KM_SLEEP) == 0);
	VERIFY(nvlist_add_nvlist(*config,
	ZPOOL_CONFIG_LOAD_INFO,
	spa->spa_load_info) == 0);
	}
	spa_unload(spa);
	spa_deactivate(spa);
	spa->spa_last_open_failed = error;
	if (locked)
	mutex_exit(&spa_namespace_lock);
	*spapp = NULL;
	return (error);
	}
	}

	spa_open_ref(spa, tag);

	if (config != NULL)
	*config = spa_config_generate(spa, NULL, -1ULL, B_TRUE);

	/*
	* If we've recovered the pool, pass back any information we
	* gathered while doing the load.
	*/
	if (state == SPA_LOAD_RECOVER) {
	VERIFY(nvlist_add_nvlist(*config, ZPOOL_CONFIG_LOAD_INFO,
	spa->spa_load_info) == 0);
	}

	if (locked) {
	spa->spa_last_open_failed = 0;
	spa->spa_last_ubsync_txg = 0;
	spa->spa_load_txg = 0;
	mutex_exit(&spa_namespace_lock);
	}

	*spapp = spa;

	return (0);
	}

	/*
	 * Open the pool called 'name' with an explicit rewind policy.  All
	 * arguments are forwarded unchanged to spa_open_common(), whose return
	 * value is passed straight back.
	 */
	int
	spa_open_rewind(const char *name, spa_t **spapp, void *tag, nvlist_t *policy,
	    nvlist_t **config)
	{
		return (spa_open_common(name, spapp, tag, policy, config));
	}

	/*
	 * Open the pool called 'name' with no rewind policy and without asking
	 * for a config to be returned (both passed as NULL to spa_open_common()).
	 */
	int
	spa_open(const char *name, spa_t **spapp, void *tag)
	{
		return (spa_open_common(name, spapp, tag, NULL, NULL));
	}

	/*
	 * Find the pool called 'name' and, if it exists, bump its inject
	 * reference count, which is meant to keep it from being exported or
	 * destroyed.  Both the lookup and the increment happen under
	 * spa_namespace_lock.  Returns the spa_t, or NULL when no pool by that
	 * name is known.
	 */
	spa_t *
	spa_inject_addref(char *name)
	{
		spa_t *found;

		mutex_enter(&spa_namespace_lock);
		found = spa_lookup(name);
		if (found != NULL)
			found->spa_inject_ref++;
		mutex_exit(&spa_namespace_lock);

		return (found);
	}

	/*
	 * Drop one inject reference on 'spa'; the counterpart of
	 * spa_inject_addref().  The counter is only modified while
	 * spa_namespace_lock is held.
	 */
	void
	spa_inject_delref(spa_t *spa)
	{
		mutex_enter(&spa_namespace_lock);
		spa->spa_inject_ref -= 1;
		mutex_exit(&spa_namespace_lock);
	}

	/*
	 * Add spares device information to the nvlist.
	 *
	 * The spare array from spa->spa_spares.sav_config is added to the vdev
	 * tree found in 'config', then looked up again so the stats updated below
	 * are those of the array stored in 'config'.  The caller must hold
	 * SCL_CONFIG as reader.  Nothing is added when the pool has no spares.
	 */
	static void
	spa_add_spares(spa_t *spa, nvlist_t *config)
	{
		nvlist_t **spares;
		uint_t i, nspares;
		nvlist_t *nvroot;
		uint64_t guid;
		vdev_stat_t *vs;
		uint_t vsc;
		uint64_t pool;

		ASSERT(spa_config_held(spa, SCL_CONFIG, RW_READER));

		if (spa->spa_spares.sav_count == 0)
			return;

		VERIFY(nvlist_lookup_nvlist(config,
		    ZPOOL_CONFIG_VDEV_TREE, &nvroot) == 0);
		VERIFY(nvlist_lookup_nvlist_array(spa->spa_spares.sav_config,
		    ZPOOL_CONFIG_SPARES, &spares, &nspares) == 0);
		if (nspares == 0)
			return;

		VERIFY(nvlist_add_nvlist_array(nvroot,
		    ZPOOL_CONFIG_SPARES, spares, nspares) == 0);
		VERIFY(nvlist_lookup_nvlist_array(nvroot,
		    ZPOOL_CONFIG_SPARES, &spares, &nspares) == 0);

		/*
		 * Go through and find any spares which have since been
		 * repurposed as an active spare.  If this is the case, update
		 * their status appropriately.
		 */
		for (i = 0; i < nspares; i++) {
			VERIFY(nvlist_lookup_uint64(spares[i],
			    ZPOOL_CONFIG_GUID, &guid) == 0);
			if (spa_spare_exists(guid, &pool, NULL) &&
			    pool != 0ULL) {
				VERIFY(nvlist_lookup_uint64_array(
				    spares[i], ZPOOL_CONFIG_VDEV_STATS,
				    (uint64_t **)&vs, &vsc) == 0);
				vs->vs_state = VDEV_STATE_CANT_OPEN;
				vs->vs_aux = VDEV_AUX_SPARED;
			}
		}
	}

	/*
	 * Add l2cache device information to the nvlist, including vdev stats.
	 *
	 * The l2cache array from spa->spa_l2cache.sav_config is added to the vdev
	 * tree found in 'config', then looked up again so the stats written below
	 * land in the array stored in 'config'.  The caller must hold SCL_CONFIG
	 * as reader.  Nothing is added when the pool has no l2cache devices.
	 */
	static void
	spa_add_l2cache(spa_t *spa, nvlist_t *config)
	{
		nvlist_t **l2cache;
		uint_t i, j, nl2cache;
		nvlist_t *nvroot;
		uint64_t guid;
		vdev_t *vd;
		vdev_stat_t *vs;
		uint_t vsc;

		ASSERT(spa_config_held(spa, SCL_CONFIG, RW_READER));

		if (spa->spa_l2cache.sav_count == 0)
			return;

		VERIFY(nvlist_lookup_nvlist(config,
		    ZPOOL_CONFIG_VDEV_TREE, &nvroot) == 0);
		VERIFY(nvlist_lookup_nvlist_array(spa->spa_l2cache.sav_config,
		    ZPOOL_CONFIG_L2CACHE, &l2cache, &nl2cache) == 0);
		if (nl2cache == 0)
			return;

		VERIFY(nvlist_add_nvlist_array(nvroot,
		    ZPOOL_CONFIG_L2CACHE, l2cache, nl2cache) == 0);
		VERIFY(nvlist_lookup_nvlist_array(nvroot,
		    ZPOOL_CONFIG_L2CACHE, &l2cache, &nl2cache) == 0);

		/*
		 * Update level 2 cache device stats.
		 */
		for (i = 0; i < nl2cache; i++) {
			VERIFY(nvlist_lookup_uint64(l2cache[i],
			    ZPOOL_CONFIG_GUID, &guid) == 0);

			/* Match the config entry to its in-core vdev by guid. */
			vd = NULL;
			for (j = 0; j < spa->spa_l2cache.sav_count; j++) {
				if (guid ==
				    spa->spa_l2cache.sav_vdevs[j]->vdev_guid) {
					vd = spa->spa_l2cache.sav_vdevs[j];
					break;
				}
			}
			ASSERT(vd != NULL);

			VERIFY(nvlist_lookup_uint64_array(l2cache[i],
			    ZPOOL_CONFIG_VDEV_STATS, (uint64_t **)&vs, &vsc)
			    == 0);
			vdev_get_stats(vd, vs);
		}
	}

	static void
	spa_add_feature_stats(spa_t spa, nvlist_t config)
	{
	nvlist_t *features;
	zap_cursor_t zc;
	zap_attribute_t za;

	ASSERT(spa_config_held(spa, SCL_CONFIG, RW_READER));
	VERIFY(nvlist_alloc(&features, NV_UNIQUE_NAME, KM_SLEEP) == 0);

	if (spa->spa_feat_for_read_obj != 0) {
	for (zap_cursor_init(&zc, spa->spa_meta_objset,
	spa->spa_feat_for_read_obj);
	zap_cursor_retrieve(&zc, &za) == 0;
	zap_cursor_advance(&zc)) {
	ASSERT(za.za_integer_length == sizeof (uint64_t) &&
	za.za_num_integers == 1);
	VERIFY3U(0, ==, nvlist_add_uint64(features, za.za_name,
	za.za_first_integer));
	}
	zap_cursor_fini(&zc);
	}

	if (spa->spa_feat_for_write_obj != 0) {
	for (zap_cursor_init(&zc, spa->spa_meta_objset,
	spa->spa_feat_for_write_obj);
	zap_cursor_retrieve(&zc, &za) == 0;
	zap_cursor_advance(&zc)) {
	ASSERT(za.za_integer_length == sizeof (uint64_t) &&
	za.za_num_integers == 1);
	VERIFY3U(0, ==, nvlist_add_uint64(features, za.za_name,
	za.za_first_integer));
	}
	zap_cursor_fini(&zc);
	}

	VERIFY(nvlist_add_nvlist(config, ZPOOL_CONFIG_FEATURE_STATS,
	features) == 0);
	nvlist_free(features);
	}

	/*
	* Open the pool called 'name' through spa_open_common() and, when that
	* yields both a spa_t and a config, decorate the config with the load
	* time, error-log size, suspension state, spares, l2cache devices and
	* feature statistics.
	*
	* If 'altroot' is non-NULL it is filled in (at most 'buflen' bytes) even
	* when the open failed, provided spa_lookup() can still find the pool;
	* otherwise it is set to the empty string.
	*
	* '*config' is reset to NULL before the open.  Returns whatever
	* spa_open_common() returned.
	*/
	int
	spa_get_stats(const char name, nvlist_t *config,
	char *altroot, size_t buflen)
	{
	int error;
	spa_t *spa;

	*config = NULL;
	error = spa_open_common(name, &spa, FTAG, NULL, config);

	if (spa != NULL) {
	/*
	* This still leaves a window of inconsistency where the spares
	* or l2cache devices could change and the config would be
	* self-inconsistent.
	*/
	/* Released by the spa_config_exit() near the end of the function. */
	spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER);

	if (*config != NULL) {
	uint64_t loadtimes[2];

	/* Seconds and nanoseconds of spa_loaded_ts, in that order. */
	loadtimes[0] = spa->spa_loaded_ts.tv_sec;
	loadtimes[1] = spa->spa_loaded_ts.tv_nsec;
	VERIFY(nvlist_add_uint64_array(*config,
	ZPOOL_CONFIG_LOADED_TIME, loadtimes, 2) == 0);

	VERIFY(nvlist_add_uint64(*config,
	ZPOOL_CONFIG_ERRCOUNT,
	spa_get_errlog_size(spa)) == 0);

	/* Only present for suspended pools; the value is the failmode. */
	if (spa_suspended(spa))
	VERIFY(nvlist_add_uint64(*config,
	ZPOOL_CONFIG_SUSPENDED,
	spa->spa_failmode) == 0);

	spa_add_spares(spa, *config);
	spa_add_l2cache(spa, *config);
	spa_add_feature_stats(spa, *config);
	}
	}

	/*
	* We want to get the alternate root even for faulted pools, so we cheat
	* and call spa_lookup() directly.
	*/
	if (altroot) {
	if (spa == NULL) {
	mutex_enter(&spa_namespace_lock);
	spa = spa_lookup(name);
	if (spa)
	spa_altroot(spa, altroot, buflen);
	else
	altroot[0] = '\0';
	/*
	* This spa came from a bare lookup: no open reference and no
	* config lock were taken, so clear it to keep the cleanup
	* below from running.
	*/
	spa = NULL;
	mutex_exit(&spa_namespace_lock);
	} else {
	spa_altroot(spa, altroot, buflen);
	}
	}

	/* Undo the config lock and the open reference taken above. */
	if (spa != NULL) {
	spa_config_exit(spa, SCL_CONFIG, FTAG);
	spa_close(spa, FTAG);
	}

	return (error);
	}

	/*
	* Validate that the auxiliary device array is well formed. We must have an
	* array of nvlists, each which describes a valid leaf vdev. If this is an
	* import (mode is VDEV_ALLOC_SPARE), then we allow corrupted spares to be
	* specified, as long as they are well-formed.
	*
	* 'config' names the nvlist array inside 'nvroot' to validate, 'sav' is the
	* matching aux-vdev bookkeeping structure, 'version' is the minimum pool
	* version that supports this device type and 'label' is the label type
	* passed to vdev_label_init(). The caller must hold SCL_ALL as writer.
	*
	* Returns 0 when the array is absent or every device validates, EINVAL for
	* an empty array or a non-leaf device, ENOTSUP when the pool version is too
	* old, ENOTBLK (kernel only) for a non-disk l2cache device, or the error
	* from spa_config_parse(), vdev_open() or vdev_label_init().
	*/
	static int
	spa_validate_aux_devs(spa_t spa, nvlist_t nvroot, uint64_t crtxg, int mode,
	spa_aux_vdev_t sav, const char config, uint64_t version,
	vdev_labeltype_t label)
	{
	nvlist_t **dev;
	uint_t i, ndev;
	vdev_t *vd;
	int error;

	ASSERT(spa_config_held(spa, SCL_ALL, RW_WRITER) == SCL_ALL);

	/*
	* It's acceptable to have no devs specified.
	*/
	if (nvlist_lookup_nvlist_array(nvroot, config, &dev, &ndev) != 0)
	return (0);

	/* An array that is present but empty is rejected. */
	if (ndev == 0)
	return (SET_ERROR(EINVAL));

	/*
	* Make sure the pool is formatted with a version that supports this
	* device type.
	*/
	if (spa_version(spa) < version)
	return (SET_ERROR(ENOTSUP));

	/*
	* Set the pending device list so we correctly handle device in-use
	* checking.
	*/
	sav->sav_pending = dev;
	sav->sav_npending = ndev;

	for (i = 0; i < ndev; i++) {
	/* 'vd' is a scratch vdev; it is freed on every path below. */
	if ((error = spa_config_parse(spa, &vd, dev[i], NULL, 0,
	mode)) != 0)
	goto out;

	if (!vd->vdev_ops->vdev_op_leaf) {
	vdev_free(vd);
	error = SET_ERROR(EINVAL);
	goto out;
	}

	/*
	* The L2ARC currently only supports disk devices in
	* kernel context. For user-level testing, we allow it.
	*/
	#ifdef _KERNEL
	if ((strcmp(config, ZPOOL_CONFIG_L2CACHE) == 0) &&
	strcmp(vd->vdev_ops->vdev_op_type, VDEV_TYPE_DISK) != 0) {
	error = SET_ERROR(ENOTBLK);
	vdev_free(vd);
	goto out;
	}
	#endif
	vd->vdev_top = vd;

	/* On success, record the device's guid in its config nvlist. */
	if ((error = vdev_open(vd)) == 0 &&
	(error = vdev_label_init(vd, crtxg, label)) == 0) {
	VERIFY(nvlist_add_uint64(dev[i], ZPOOL_CONFIG_GUID,
	vd->vdev_guid) == 0);
	}

	vdev_free(vd);

	/*
	* Open/label failures are fatal unless mode is VDEV_ALLOC_SPARE
	* or VDEV_ALLOC_L2CACHE, in which case the error is discarded
	* and the loop moves on to the next device.
	*/
	if (error &&
	(mode != VDEV_ALLOC_SPARE && mode != VDEV_ALLOC_L2CACHE))
	goto out;
	else
	error = 0;
	}

	out:
	/* Always clear the pending list, whether we succeeded or failed. */
	sav->sav_pending = NULL;
	sav->sav_npending = 0;
	return (error);
	}

	/*
	 * Validate the spare array and then the l2cache array found in 'nvroot'
	 * by running spa_validate_aux_devs() on each.  The caller must hold
	 * SCL_ALL as writer.  Returns the first non-zero error, or 0 when both
	 * arrays validate.
	 */
	static int
	spa_validate_aux(spa_t *spa, nvlist_t *nvroot, uint64_t crtxg, int mode)
	{
		int error;

		ASSERT(spa_config_held(spa, SCL_ALL, RW_WRITER) == SCL_ALL);

		error = spa_validate_aux_devs(spa, nvroot, crtxg, mode,
		    &spa->spa_spares, ZPOOL_CONFIG_SPARES, SPA_VERSION_SPARES,
		    VDEV_LABEL_SPARE);
		if (error != 0)
			return (error);

		return (spa_validate_aux_devs(spa, nvroot, crtxg, mode,
		    &spa->spa_l2cache, ZPOOL_CONFIG_L2CACHE, SPA_VERSION_L2CACHE,
		    VDEV_LABEL_L2CACHE));
	}

	/*
	 * Record 'ndevs' device nvlists from 'devs' in sav->sav_config under the
	 * array name 'config'.  When sav_config already exists, the new devices
	 * are appended to the array already stored there; otherwise a fresh
	 * sav_config holding just 'devs' is allocated.  The nvlists in 'devs' are
	 * not consumed: the stored array is built from copies.
	 */
	static void
	spa_set_aux_vdevs(spa_aux_vdev_t *sav, nvlist_t **devs, int ndevs,
	    const char *config)
	{
		nvlist_t **olddevs;
		nvlist_t **newdevs;
		uint_t oldndevs, newndevs, i;
		int j;

		if (sav->sav_config == NULL) {
			/*
			 * Generate a new dev list.
			 */
			VERIFY(nvlist_alloc(&sav->sav_config, NV_UNIQUE_NAME,
			    KM_SLEEP) == 0);
			VERIFY(nvlist_add_nvlist_array(sav->sav_config, config,
			    devs, ndevs) == 0);
			return;
		}

		/*
		 * Generate new dev list by concatenating with the
		 * current dev list.
		 */
		VERIFY(nvlist_lookup_nvlist_array(sav->sav_config, config,
		    &olddevs, &oldndevs) == 0);

		/* One element count drives the alloc, the add and the free. */
		newndevs = oldndevs + ndevs;
		newdevs = kmem_alloc(sizeof (void *) * newndevs, KM_SLEEP);
		for (i = 0; i < oldndevs; i++)
			VERIFY(nvlist_dup(olddevs[i], &newdevs[i],
			    KM_SLEEP) == 0);
		for (j = 0; j < ndevs; j++)
			VERIFY(nvlist_dup(devs[j], &newdevs[j + oldndevs],
			    KM_SLEEP) == 0);

		/* olddevs points into the entry removed here; it is dead now. */
		VERIFY(nvlist_remove(sav->sav_config, config,
		    DATA_TYPE_NVLIST_ARRAY) == 0);

		VERIFY(nvlist_add_nvlist_array(sav->sav_config,
		    config, newdevs, newndevs) == 0);

		/* Release the scratch copies and the scratch array itself. */
		for (i = 0; i < newndevs; i++)
			nvlist_free(newdevs[i]);
		kmem_free(newdevs, sizeof (void *) * newndevs);
	}

	/*
	 * Stop and drop level 2 ARC devices.
	 *
	 * Every vdev in spa->spa_l2cache is checked in turn; it is removed from
	 * the L2ARC only when spa_l2cache_exists() knows its guid, the pool it
	 * reports is non-zero, and l2arc_vdev_present() says the device is
	 * present.
	 */
	void
	spa_l2cache_drop(spa_t *spa)
	{
		spa_aux_vdev_t *cache = &spa->spa_l2cache;

		for (int idx = 0; idx < cache->sav_count; idx++) {
			vdev_t *dev = cache->sav_vdevs[idx];
			uint64_t owner;

			ASSERT(dev != NULL);

			if (!spa_l2cache_exists(dev->vdev_guid, &owner))
				continue;
			if (owner == 0ULL || !l2arc_vdev_present(dev))
				continue;

			l2arc_remove_vdev(dev);
		}
	}

	/*
	* Pool Creation
	*
	* Create a new pool called 'pool' from the vdev tree 'nvroot'. 'props'
	* (may be NULL) supplies pool properties and 'zplprops' is handed to
	* dsl_pool_create(). The whole operation runs under spa_namespace_lock.
	*
	* Returns EEXIST when a pool of that name already exists, the error from
	* spa_prop_validate(), EINVAL when zfs_allocatable_devs() rejects 'nvroot',
	* the error from spa_config_parse(), vdev_create() or spa_validate_aux(),
	* and 0 on success.
	*/
	int
	spa_create(const char pool, nvlist_t nvroot, nvlist_t *props,
	nvlist_t *zplprops)
	{
	spa_t *spa;
	char *altroot = NULL;
	vdev_t *rvd;
	dsl_pool_t *dp;
	dmu_tx_t *tx;
	int error = 0;
	uint64_t txg = TXG_INITIAL;
	nvlist_t spares, l2cache;
	uint_t nspares, nl2cache;
	uint64_t version, obj;
	boolean_t has_features;

	/*
	* If this pool already exists, return failure.
	*/
	mutex_enter(&spa_namespace_lock);
	if (spa_lookup(pool) != NULL) {
	mutex_exit(&spa_namespace_lock);
	return (SET_ERROR(EEXIST));
	}

	/*
	* Allocate a new spa_t structure.
	*/
	(void) nvlist_lookup_string(props,
	zpool_prop_to_name(ZPOOL_PROP_ALTROOT), &altroot);
	spa = spa_add(pool, NULL, altroot);
	spa_activate(spa, spa_mode_global);

	/* Invalid properties: tear the new spa down again and bail out. */
	if (props && (error = spa_prop_validate(spa, props))) {
	spa_deactivate(spa);
	spa_remove(spa);
	mutex_exit(&spa_namespace_lock);
	return (error);
	}

	/* Note whether any property in 'props' is a feature property. */
	has_features = B_FALSE;
	for (nvpair_t *elem = nvlist_next_nvpair(props, NULL);
	elem != NULL; elem = nvlist_next_nvpair(props, elem)) {
	if (zpool_prop_feature(nvpair_name(elem)))
	has_features = B_TRUE;
	}

	/*
	* Use SPA_VERSION when features were requested or when no explicit
	* version property was supplied.
	*/
	if (has_features \|\| nvlist_lookup_uint64(props,
	zpool_prop_to_name(ZPOOL_PROP_VERSION), &version) != 0) {
	version = SPA_VERSION;
	}
	ASSERT(SPA_VERSION_IS_SUPPORTED(version));

	/* Seed the in-core uberblock one txg behind the first txg. */
	spa->spa_first_txg = txg;
	spa->spa_uberblock.ub_txg = txg - 1;
	spa->spa_uberblock.ub_version = version;
	spa->spa_ubsync = spa->spa_uberblock;

	/*
	* Create "The Godfather" zio to hold all async IOs
	*/
	spa->spa_async_zio_root = kmem_alloc(max_ncpus * sizeof (void *),
	KM_SLEEP);
	for (int i = 0; i < max_ncpus; i++) {
	spa->spa_async_zio_root[i] = zio_root(spa, NULL, NULL,
	ZIO_FLAG_CANFAIL \| ZIO_FLAG_SPECULATIVE \|
	ZIO_FLAG_GODFATHER);
	}

	/*
	* Create the root vdev.
	*/
	spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);

	error = spa_config_parse(spa, &rvd, nvroot, NULL, 0, VDEV_ALLOC_ADD);

	ASSERT(error != 0 \|\| rvd != NULL);
	ASSERT(error != 0 \|\| spa->spa_root_vdev == rvd);

	if (error == 0 && !zfs_allocatable_devs(nvroot))
	error = SET_ERROR(EINVAL);

	/*
	* Create the vdevs and validate the aux devices; only when both
	* succeed are the top-level vdevs sized and expanded.
	*/
	if (error == 0 &&
	(error = vdev_create(rvd, txg, B_FALSE)) == 0 &&
	(error = spa_validate_aux(spa, nvroot, txg,
	VDEV_ALLOC_ADD)) == 0) {
	for (int c = 0; c < rvd->vdev_children; c++) {
	vdev_metaslab_set_size(rvd->vdev_child[c]);
	vdev_expand(rvd->vdev_child[c], txg);
	}
	}

	spa_config_exit(spa, SCL_ALL, FTAG);

	/* Any failure above: unload, deactivate and remove the new spa. */
	if (error != 0) {
	spa_unload(spa);
	spa_deactivate(spa);
	spa_remove(spa);
	mutex_exit(&spa_namespace_lock);
	return (error);
	}

	/*
	* Get the list of spares, if specified.
	*/
	if (nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_SPARES,
	&spares, &nspares) == 0) {
	VERIFY(nvlist_alloc(&spa->spa_spares.sav_config, NV_UNIQUE_NAME,
	KM_SLEEP) == 0);
	VERIFY(nvlist_add_nvlist_array(spa->spa_spares.sav_config,
	ZPOOL_CONFIG_SPARES, spares, nspares) == 0);
	spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
	spa_load_spares(spa);
	spa_config_exit(spa, SCL_ALL, FTAG);
	spa->spa_spares.sav_sync = B_TRUE;
	}

	/*
	* Get the list of level 2 cache devices, if specified.
	*/
	if (nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_L2CACHE,
	&l2cache, &nl2cache) == 0) {
	VERIFY(nvlist_alloc(&spa->spa_l2cache.sav_config,
	NV_UNIQUE_NAME, KM_SLEEP) == 0);
	VERIFY(nvlist_add_nvlist_array(spa->spa_l2cache.sav_config,
	ZPOOL_CONFIG_L2CACHE, l2cache, nl2cache) == 0);
	spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
	spa_load_l2cache(spa);
	spa_config_exit(spa, SCL_ALL, FTAG);
	spa->spa_l2cache.sav_sync = B_TRUE;
	}

	/* spa_is_initializing is set only around dsl_pool_create(). */
	spa->spa_is_initializing = B_TRUE;
	spa->spa_dsl_pool = dp = dsl_pool_create(spa, zplprops, txg);
	spa->spa_meta_objset = dp->dp_meta_objset;
	spa->spa_is_initializing = B_FALSE;

	/*
	* Create DDTs (dedup tables).
	*/
	ddt_create(spa);

	spa_update_dspace(spa);

	/* Everything below up to dmu_tx_commit() happens in this tx. */
	tx = dmu_tx_create_assigned(dp, txg);

	/*
	* Create the pool config object.
	*/
	spa->spa_config_object = dmu_object_alloc(spa->spa_meta_objset,
	DMU_OT_PACKED_NVLIST, SPA_CONFIG_BLOCKSIZE,
	DMU_OT_PACKED_NVLIST_SIZE, sizeof (uint64_t), tx);

	/* Failing to add a pool directory entry panics (CE_PANIC). */
	if (zap_add(spa->spa_meta_objset,
	DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_CONFIG,
	sizeof (uint64_t), 1, &spa->spa_config_object, tx) != 0) {
	cmn_err(CE_PANIC, "failed to add pool config");
	}

	if (spa_version(spa) >= SPA_VERSION_FEATURES)
	spa_feature_create_zap_objects(spa, tx);

	if (zap_add(spa->spa_meta_objset,
	DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_CREATION_VERSION,
	sizeof (uint64_t), 1, &version, tx) != 0) {
	cmn_err(CE_PANIC, "failed to add pool version");
	}

	/* Newly created pools with the right version are always deflated. */
	if (version >= SPA_VERSION_RAIDZ_DEFLATE) {
	spa->spa_deflate = TRUE;
	if (zap_add(spa->spa_meta_objset,
	DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_DEFLATE,
	sizeof (uint64_t), 1, &spa->spa_deflate, tx) != 0) {
	cmn_err(CE_PANIC, "failed to add deflate");
	}
	}

	/*
	* Create the deferred-free bpobj. Turn off compression
	* because sync-to-convergence takes longer if the blocksize
	* keeps changing.
	*/
	obj = bpobj_alloc(spa->spa_meta_objset, 1 << 14, tx);
	dmu_object_set_compress(spa->spa_meta_objset, obj,
	ZIO_COMPRESS_OFF, tx);
	if (zap_add(spa->spa_meta_objset,
	DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_SYNC_BPOBJ,
	sizeof (uint64_t), 1, &obj, tx) != 0) {
	cmn_err(CE_PANIC, "failed to add bpobj");
	}
	VERIFY3U(0, ==, bpobj_open(&spa->spa_deferred_bpobj,
	spa->spa_meta_objset, obj));

	/*
	* Create the pool's history object.
	*/
	if (version >= SPA_VERSION_ZPOOL_HISTORY)
	spa_history_create_obj(spa, tx);

	/*
	* Set pool properties.
	*/
	spa->spa_bootfs = zpool_prop_default_numeric(ZPOOL_PROP_BOOTFS);
	spa->spa_delegation = zpool_prop_default_numeric(ZPOOL_PROP_DELEGATION);
	spa->spa_failmode = zpool_prop_default_numeric(ZPOOL_PROP_FAILUREMODE);
	spa->spa_autoexpand = zpool_prop_default_numeric(ZPOOL_PROP_AUTOEXPAND);

	/* Caller-supplied properties are applied after the defaults above. */
	if (props != NULL) {
	spa_configfile_set(spa, props, B_FALSE);
	spa_sync_props(props, tx);
	}

	dmu_tx_commit(tx);

	spa->spa_sync_on = B_TRUE;
	txg_sync_start(spa->spa_dsl_pool);

	/*
	* We explicitly wait for the first transaction to complete so that our
	* bean counters are appropriately updated.
	*/
	txg_wait_synced(spa->spa_dsl_pool, txg);

	spa_config_sync(spa, B_FALSE, B_TRUE);

	spa_history_log_version(spa, "create");

	/*
	* Don't count references from objsets that are already closed
	* and are making their way through the eviction process.
	*/
	spa_evicting_os_wait(spa);
	spa->spa_minref = refcount_count(&spa->spa_refcount);

	mutex_exit(&spa_namespace_lock);

	return (0);
	}

	#ifdef _KERNEL
	/*
	* Get the root pool information from the root disk, then import the root pool
	* during the system boot up time.
	*/
	extern int vdev_disk_read_rootlabel(char , char , nvlist_t **);

	/*
	* Read the label of the device identified by 'devpath'/'devid' and turn it
	* into a pool config whose vdev tree is a root vdev with the label's
	* vdev tree as its single child. The label's ZPOOL_CONFIG_GUID value is
	* stored through 'guid'.
	*
	* Returns the config nvlist, or NULL when the label could not be read.
	* Every caller visible here releases the returned list with nvlist_free().
	*/
	static nvlist_t *
	spa_generate_rootconf(char devpath, char devid, uint64_t *guid)
	{
	nvlist_t *config;
	nvlist_t nvtop, nvroot;
	uint64_t pgid;

	if (vdev_disk_read_rootlabel(devpath, devid, &config) != 0)
	return (NULL);

	/*
	* Add this top-level vdev to the child array.
	*/
	VERIFY(nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE,
	&nvtop) == 0);
	VERIFY(nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_GUID,
	&pgid) == 0);
	VERIFY(nvlist_lookup_uint64(config, ZPOOL_CONFIG_GUID, guid) == 0);

	/*
	* Put this pool's top-level vdevs into a root vdev.
	*/
	/* The root vdev gets id 0 and uses the pool guid as its guid. */
	VERIFY(nvlist_alloc(&nvroot, NV_UNIQUE_NAME, KM_SLEEP) == 0);
	VERIFY(nvlist_add_string(nvroot, ZPOOL_CONFIG_TYPE,
	VDEV_TYPE_ROOT) == 0);
	VERIFY(nvlist_add_uint64(nvroot, ZPOOL_CONFIG_ID, 0ULL) == 0);
	VERIFY(nvlist_add_uint64(nvroot, ZPOOL_CONFIG_GUID, pgid) == 0);
	VERIFY(nvlist_add_nvlist_array(nvroot, ZPOOL_CONFIG_CHILDREN,
	&nvtop, 1) == 0);

	/*
	* Replace the existing vdev_tree with the new root vdev in
	* this pool's configuration (remove the old, add the new).
	*/
	VERIFY(nvlist_add_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, nvroot) == 0);
	/* Our local 'nvroot' is no longer needed once added to 'config'. */
	nvlist_free(nvroot);
	return (config);
	}

	/*
	 * Walk the vdev tree and see if we can find a device with "better"
	 * configuration. A configuration is "better" if the label on that
	 * device has a more recent txg.
	 *
	 *	vd	root of the (sub)tree to search; children are visited first
	 *	avd	in/out: best device found so far
	 *	txg	in/out: label txg of the best device found so far
	 *
	 * Leaves whose label cannot be read are silently skipped.
	 */
	static void
	spa_alt_rootvdev(vdev_t *vd, vdev_t **avd, uint64_t *txg)
	{
		nvlist_t *label;
		uint64_t label_txg;

		for (int c = 0; c < vd->vdev_children; c++)
			spa_alt_rootvdev(vd->vdev_child[c], avd, txg);

		/* Only leaf vdevs have a label to compare. */
		if (!vd->vdev_ops->vdev_op_leaf)
			return;

		if (vdev_disk_read_rootlabel(vd->vdev_physpath, vd->vdev_devid,
		    &label) != 0)
			return;

		VERIFY(nvlist_lookup_uint64(label, ZPOOL_CONFIG_POOL_TXG,
		    &label_txg) == 0);

		/*
		 * Do we have a better boot device?
		 */
		if (label_txg > *txg) {
			*txg = label_txg;
			*avd = vd;
		}
		nvlist_free(label);
	}

	/*
	 * Import a root pool.
	 *
	 * For x86. devpath_list will consist of devid and/or physpath name of
	 * the vdev (e.g. "id1,sd@SSEAGATE..." or "/pci@1f,0/ide@d/disk@0,0:a").
	 * The GRUB "findroot" command will return the vdev we should boot.
	 *
	 * For Sparc, devpath_list consists the physpath name of the booting device
	 * no matter the rootpool is a single device pool or a mirrored pool.
	 * e.g.
	 *	"/pci@1f,0/ide@d/disk@0,0:a"
	 *
	 * Returns 0 on success, EIO when the boot device's label cannot be read,
	 * the error from spa_config_parse(), ENOENT when the boot vdev is not in
	 * the parsed tree, or EINVAL when a different device should be booted
	 * from (newer label elsewhere, or the boot device is currently spared).
	 */
	int
	spa_import_rootpool(char *devpath, char *devid)
	{
		spa_t *spa;
		vdev_t *rvd, *bvd, *avd = NULL;
		nvlist_t *config, *nvtop;
		uint64_t guid, txg;
		char *pname;
		int error;

		/*
		 * Read the label from the boot device and generate a configuration.
		 */
		config = spa_generate_rootconf(devpath, devid, &guid);
	#if defined(_OBP) && defined(_KERNEL)
		if (config == NULL) {
			if (strstr(devpath, "/iscsi/ssd") != NULL) {
				/* iscsi boot */
				get_iscsi_bootpath_phy(devpath);
				config = spa_generate_rootconf(devpath, devid, &guid);
			}
		}
	#endif
		if (config == NULL) {
			cmn_err(CE_NOTE, "Cannot read the pool label from '%s'",
			    devpath);
			return (SET_ERROR(EIO));
		}

		/* pname points into 'config' and is only valid while it lives. */
		VERIFY(nvlist_lookup_string(config, ZPOOL_CONFIG_POOL_NAME,
		    &pname) == 0);
		VERIFY(nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_TXG, &txg) == 0);

		mutex_enter(&spa_namespace_lock);
		if ((spa = spa_lookup(pname)) != NULL) {
			/*
			 * Remove the existing root pool from the namespace so that we
			 * can replace it with the correct config we just read in.
			 */
			spa_remove(spa);
		}

		spa = spa_add(pname, config, NULL);
		spa->spa_is_root = B_TRUE;
		spa->spa_import_flags = ZFS_IMPORT_VERBATIM;

		/*
		 * Build up a vdev tree based on the boot device's label config.
		 */
		VERIFY(nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE,
		    &nvtop) == 0);
		spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
		error = spa_config_parse(spa, &rvd, nvtop, NULL, 0,
		    VDEV_ALLOC_ROOTPOOL);
		spa_config_exit(spa, SCL_ALL, FTAG);
		if (error) {
			mutex_exit(&spa_namespace_lock);
			/*
			 * Log before freeing 'config': pname is a pointer into
			 * that nvlist and must not be used after nvlist_free().
			 */
			cmn_err(CE_NOTE, "Can not parse the config for pool '%s'",
			    pname);
			nvlist_free(config);
			return (error);
		}

		/*
		 * Get the boot vdev.
		 */
		if ((bvd = vdev_lookup_by_guid(rvd, guid)) == NULL) {
			cmn_err(CE_NOTE, "Can not find the boot vdev for guid %llu",
			    (u_longlong_t)guid);
			error = SET_ERROR(ENOENT);
			goto out;
		}

		/*
		 * Determine if there is a better boot device.
		 */
		avd = bvd;
		spa_alt_rootvdev(rvd, &avd, &txg);
		if (avd != bvd) {
			cmn_err(CE_NOTE, "The boot device is 'degraded'. Please "
			    "try booting from '%s'", avd->vdev_path);
			error = SET_ERROR(EINVAL);
			goto out;
		}

		/*
		 * If the boot device is part of a spare vdev then ensure that
		 * we're booting off the active spare.
		 */
		if (bvd->vdev_parent->vdev_ops == &vdev_spare_ops &&
		    !bvd->vdev_isspare) {
			cmn_err(CE_NOTE, "The boot device is currently spared. Please "
			    "try booting from '%s'",
			    bvd->vdev_parent->
			    vdev_child[bvd->vdev_parent->vdev_children - 1]->vdev_path);
			error = SET_ERROR(EINVAL);
			goto out;
		}

		error = 0;
	out:
		/* The parsed tree was only needed for the checks above. */
		spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
		vdev_free(rvd);
		spa_config_exit(spa, SCL_ALL, FTAG);
		mutex_exit(&spa_namespace_lock);

		nvlist_free(config);
		return (error);
	}

	#endif

	/*
	* Import a non-root pool into the system.
	*
	* 'pool' is the name to import under, 'config' the user-supplied pool
	* config (load info is added to it on the way out), 'props' optional pool
	* properties (may be NULL) and 'flags' the ZFS_IMPORT_* flags stored in
	* spa_import_flags. The whole operation runs under spa_namespace_lock.
	*
	* Returns EEXIST when a pool of that name already exists, 0 right away for
	* a verbatim import, the error from spa_load_best(), spa_validate_aux() or
	* spa_prop_set() on failure, and 0 on success.
	*/
	int
	spa_import(const char pool, nvlist_t config, nvlist_t *props, uint64_t flags)
	{
	spa_t *spa;
	char *altroot = NULL;
	spa_load_state_t state = SPA_LOAD_IMPORT;
	zpool_rewind_policy_t policy;
	uint64_t mode = spa_mode_global;
	uint64_t readonly = B_FALSE;
	int error;
	nvlist_t *nvroot;
	nvlist_t spares, l2cache;
	uint_t nspares, nl2cache;

	/*
	* If a pool with this name exists, return failure.
	*/
	mutex_enter(&spa_namespace_lock);
	if (spa_lookup(pool) != NULL) {
	mutex_exit(&spa_namespace_lock);
	return (SET_ERROR(EEXIST));
	}

	/*
	* Create and initialize the spa structure.
	*/
	(void) nvlist_lookup_string(props,
	zpool_prop_to_name(ZPOOL_PROP_ALTROOT), &altroot);
	(void) nvlist_lookup_uint64(props,
	zpool_prop_to_name(ZPOOL_PROP_READONLY), &readonly);
	/* A readonly import activates the pool with FREAD only. */
	if (readonly)
	mode = FREAD;
	spa = spa_add(pool, config, altroot);
	spa->spa_import_flags = flags;

	/*
	* Verbatim import - Take a pool and insert it into the namespace
	* as if it had been loaded at boot.
	*/
	if (spa->spa_import_flags & ZFS_IMPORT_VERBATIM) {
	if (props != NULL)
	spa_configfile_set(spa, props, B_FALSE);

	spa_config_sync(spa, B_FALSE, B_TRUE);

	mutex_exit(&spa_namespace_lock);
	return (0);
	}

	spa_activate(spa, mode);

	/*
	* Don't start async tasks until we know everything is healthy.
	*/
	spa_async_suspend(spa);

	zpool_get_rewind_policy(config, &policy);
	if (policy.zrp_request & ZPOOL_DO_REWIND)
	state = SPA_LOAD_RECOVER;

	/*
	* Pass off the heavy lifting to spa_load(). Pass TRUE for mosconfig
	* because the user-supplied config is actually the one to trust when
	* doing an import.
	*/
	if (state != SPA_LOAD_RECOVER)
	spa->spa_last_ubsync_txg = spa->spa_load_txg = 0;

	error = spa_load_best(spa, state, B_TRUE, policy.zrp_txg,
	policy.zrp_request);

	/*
	* Propagate anything learned while loading the pool and pass it
	* back to caller (i.e. rewind info, missing devices, etc).
	*/
	/* Done whether or not the load succeeded. */
	VERIFY(nvlist_add_nvlist(config, ZPOOL_CONFIG_LOAD_INFO,
	spa->spa_load_info) == 0);

	spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
	/*
	* Toss any existing sparelist, as it doesn't have any validity
	* anymore, and conflicts with spa_has_spare().
	*/
	if (spa->spa_spares.sav_config) {
	nvlist_free(spa->spa_spares.sav_config);
	spa->spa_spares.sav_config = NULL;
	spa_load_spares(spa);
	}
	if (spa->spa_l2cache.sav_config) {
	nvlist_free(spa->spa_l2cache.sav_config);
	spa->spa_l2cache.sav_config = NULL;
	spa_load_l2cache(spa);
	}

	VERIFY(nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE,
	&nvroot) == 0);
	/* Validation is skipped when the load itself already failed. */
	if (error == 0)
	error = spa_validate_aux(spa, nvroot, -1ULL,
	VDEV_ALLOC_SPARE);
	if (error == 0)
	error = spa_validate_aux(spa, nvroot, -1ULL,
	VDEV_ALLOC_L2CACHE);
	spa_config_exit(spa, SCL_ALL, FTAG);

	if (props != NULL)
	spa_configfile_set(spa, props, B_FALSE);

	/*
	* Fail on any earlier error, or when setting the properties on a
	* writeable pool fails; either way the spa is torn down and removed.
	*/
	if (error != 0 \|\| (props && spa_writeable(spa) &&
	(error = spa_prop_set(spa, props)))) {
	spa_unload(spa);
	spa_deactivate(spa);
	spa_remove(spa);
	mutex_exit(&spa_namespace_lock);
	return (error);
	}

	spa_async_resume(spa);

	/*
	* Override any spares and level 2 cache devices as specified by
	* the user, as these may have correct device names/devids, etc.
	*/
	if (nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_SPARES,
	&spares, &nspares) == 0) {
	/* Reuse an existing sav_config, otherwise allocate a new one. */
	if (spa->spa_spares.sav_config)
	VERIFY(nvlist_remove(spa->spa_spares.sav_config,
	ZPOOL_CONFIG_SPARES, DATA_TYPE_NVLIST_ARRAY) == 0);
	else
	VERIFY(nvlist_alloc(&spa->spa_spares.sav_config,
	NV_UNIQUE_NAME, KM_SLEEP) == 0);
	VERIFY(nvlist_add_nvlist_array(spa->spa_spares.sav_config,
	ZPOOL_CONFIG_SPARES, spares, nspares) == 0);
	spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
	spa_load_spares(spa);
	spa_config_exit(spa, SCL_ALL, FTAG);
	spa->spa_spares.sav_sync = B_TRUE;
	}
	if (nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_L2CACHE,
	&l2cache, &nl2cache) == 0) {
	if (spa->spa_l2cache.sav_config)
	VERIFY(nvlist_remove(spa->spa_l2cache.sav_config,
	ZPOOL_CONFIG_L2CACHE, DATA_TYPE_NVLIST_ARRAY) == 0);
	else
	VERIFY(nvlist_alloc(&spa->spa_l2cache.sav_config,
	NV_UNIQUE_NAME, KM_SLEEP) == 0);
	VERIFY(nvlist_add_nvlist_array(spa->spa_l2cache.sav_config,
	ZPOOL_CONFIG_L2CACHE, l2cache, nl2cache) == 0);
	spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
	spa_load_l2cache(spa);
	spa_config_exit(spa, SCL_ALL, FTAG);
	spa->spa_l2cache.sav_sync = B_TRUE;
	}

	/*
	* Check for any removed devices.
	*/
	if (spa->spa_autoreplace) {
	spa_aux_check_removed(&spa->spa_spares);
	spa_aux_check_removed(&spa->spa_l2cache);
	}

	if (spa_writeable(spa)) {
	/*
	* Update the config cache to include the newly-imported pool.
	*/
	spa_config_update(spa, SPA_CONFIG_UPDATE_POOL);
	}

	/*
	* It's possible that the pool was expanded while it was exported.
	* We kick off an async task to handle this for us.
	*/
	spa_async_request(spa, SPA_ASYNC_AUTOEXPAND);

	mutex_exit(&spa_namespace_lock);
	/* The history entry is logged after the namespace lock is dropped. */
	spa_history_log_version(spa, "import");

	return (0);
	}

	/*
	* Trial-import the pool described by 'tryconfig' under the temporary name
	* TRYIMPORT_NAME, opened with FREAD, and report what was found.
	*
	* Returns a newly generated config carrying the pool name and state taken
	* from 'tryconfig', the uberblock timestamp, the load info, optionally the
	* bootfs dataset name, and the spares/l2cache lists. Returns NULL when
	* 'tryconfig' lacks a pool name or pool state, or when the load did not
	* get far enough to build a root vdev. The temporary spa is always
	* unloaded and removed before returning.
	*/
	nvlist_t *
	spa_tryimport(nvlist_t *tryconfig)
	{
	nvlist_t *config = NULL;
	char *poolname;
	spa_t *spa;
	uint64_t state;
	int error;

	if (nvlist_lookup_string(tryconfig, ZPOOL_CONFIG_POOL_NAME, &poolname))
	return (NULL);

	if (nvlist_lookup_uint64(tryconfig, ZPOOL_CONFIG_POOL_STATE, &state))
	return (NULL);

	/*
	* Create and initialize the spa structure.
	*/
	mutex_enter(&spa_namespace_lock);
	spa = spa_add(TRYIMPORT_NAME, tryconfig, NULL);
	spa_activate(spa, FREAD);

	/*
	* Pass off the heavy lifting to spa_load().
	* Pass TRUE for mosconfig because the user-supplied config
	* is actually the one to trust when doing an import.
	*/
	error = spa_load(spa, SPA_LOAD_TRYIMPORT, SPA_IMPORT_EXISTING, B_TRUE);

	/*
	* If 'tryconfig' was at least parsable, return the current config.
	*/
	if (spa->spa_root_vdev != NULL) {
	config = spa_config_generate(spa, NULL, -1ULL, B_TRUE);
	/* Report the real pool name and state, not the temporary ones. */
	VERIFY(nvlist_add_string(config, ZPOOL_CONFIG_POOL_NAME,
	poolname) == 0);
	VERIFY(nvlist_add_uint64(config, ZPOOL_CONFIG_POOL_STATE,
	state) == 0);
	VERIFY(nvlist_add_uint64(config, ZPOOL_CONFIG_TIMESTAMP,
	spa->spa_uberblock.ub_timestamp) == 0);
	VERIFY(nvlist_add_nvlist(config, ZPOOL_CONFIG_LOAD_INFO,
	spa->spa_load_info) == 0);

	/*
	* If the bootfs property exists on this pool then we
	* copy it out so that external consumers can tell which
	* pools are bootable.
	*/
	if ((!error \|\| error == EEXIST) && spa->spa_bootfs) {
	char *tmpname = kmem_alloc(MAXPATHLEN, KM_SLEEP);

	/*
	* We have to play games with the name since the
	* pool was opened as TRYIMPORT_NAME.
	*/
	if (dsl_dsobj_to_dsname(spa_name(spa),
	spa->spa_bootfs, tmpname) == 0) {
	char *cp;
	char *dsname = kmem_alloc(MAXPATHLEN, KM_SLEEP);

	/*
	* With no '/' the name is copied as is; otherwise
	* everything before the first '/' is replaced by
	* the real pool name.
	*/
	cp = strchr(tmpname, '/');
	if (cp == NULL) {
	(void) strlcpy(dsname, tmpname,
	MAXPATHLEN);
	} else {
	(void) snprintf(dsname, MAXPATHLEN,
	"%s/%s", poolname, ++cp);
	}
	VERIFY(nvlist_add_string(config,
	ZPOOL_CONFIG_BOOTFS, dsname) == 0);
	kmem_free(dsname, MAXPATHLEN);
	}
	kmem_free(tmpname, MAXPATHLEN);
	}

	/*
	* Add the list of hot spares and level 2 cache devices.
	*/
	spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER);
	spa_add_spares(spa, config);
	spa_add_l2cache(spa, config);
	spa_config_exit(spa, SCL_CONFIG, FTAG);
	}

	/* The temporary spa is discarded whether or not a config was built. */
	spa_unload(spa);
	spa_deactivate(spa);
	spa_remove(spa);
	mutex_exit(&spa_namespace_lock);

	return (config);
	}

	/*
	 * Pool export/destroy
	 *
	 * The act of destroying or exporting a pool is very simple.  We make sure
	 * there is no more pending I/O and any references to the pool are gone.
	 * Then, we update the pool state and sync all the labels to disk, removing
	 * the configuration from the cache afterwards.  If the 'hardforce' flag is
	 * set, then we don't sync the labels or remove the configuration cache.
	 *
	 * Arguments:
	 *	pool		name used to look the pool up in the namespace
	 *	new_state	target state; POOL_STATE_UNINITIALIZED means
	 *			"reset": the pool is unloaded but not removed
	 *	oldconfig	if non-NULL, *oldconfig is first set to NULL and
	 *			later receives a duplicate of spa_config, if any
	 *	force		permit an export even with an active shared spare
	 *	hardforce	skip dirtying the labels and syncing the config
	 *
	 * Returns 0 on success, EROFS if spa_mode_global lacks FWRITE, ENOENT if
	 * the pool is not found, EBUSY if the pool is still referenced, or EXDEV
	 * if it has an active shared spare and 'force' is not set.
	 */
	static int
	spa_export_common(char *pool, int new_state, nvlist_t **oldconfig,
	    boolean_t force, boolean_t hardforce)
	{
		spa_t *spa;

		if (oldconfig)
			*oldconfig = NULL;

		if (!(spa_mode_global & FWRITE))
			return (SET_ERROR(EROFS));

		mutex_enter(&spa_namespace_lock);
		if ((spa = spa_lookup(pool)) == NULL) {
			mutex_exit(&spa_namespace_lock);
			return (SET_ERROR(ENOENT));
		}

		/*
		 * Put a hold on the pool, drop the namespace lock, stop async
		 * tasks, reacquire the namespace lock, and see if we can export.
		 */
		spa_open_ref(spa, FTAG);
		mutex_exit(&spa_namespace_lock);
		spa_async_suspend(spa);
		mutex_enter(&spa_namespace_lock);
		spa_close(spa, FTAG);

		/*
		 * The pool will be in core if it's openable,
		 * in which case we can modify its state.
		 */
		if (spa->spa_state != POOL_STATE_UNINITIALIZED && spa->spa_sync_on) {
			/*
			 * Objsets may be open only because they're dirty, so we
			 * have to force it to sync before checking spa_refcnt.
			 */
			txg_wait_synced(spa->spa_dsl_pool, 0);
			spa_evicting_os_wait(spa);

			/*
			 * A pool cannot be exported or destroyed if there are
			 * active references.  If we are resetting a pool, allow
			 * references by fault injection handlers.
			 */
			if (!spa_refcount_zero(spa) ||
			    (spa->spa_inject_ref != 0 &&
			    new_state != POOL_STATE_UNINITIALIZED)) {
				spa_async_resume(spa);
				mutex_exit(&spa_namespace_lock);
				return (SET_ERROR(EBUSY));
			}

			/*
			 * A pool cannot be exported if it has an active shared
			 * spare.  This is to prevent other pools stealing the
			 * active spare from an exported pool.  At user's own
			 * will, such pool can be forcedly exported.
			 */
			if (!force && new_state == POOL_STATE_EXPORTED &&
			    spa_has_active_shared_spare(spa)) {
				spa_async_resume(spa);
				mutex_exit(&spa_namespace_lock);
				return (SET_ERROR(EXDEV));
			}

			/*
			 * We want this to be reflected on every label,
			 * so mark them all dirty.  spa_unload() will do the
			 * final sync that pushes these changes out.
			 */
			if (new_state != POOL_STATE_UNINITIALIZED && !hardforce) {
				spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
				spa->spa_state = new_state;
				spa->spa_final_txg = spa_last_synced_txg(spa) +
				    TXG_DEFER_SIZE + 1;
				vdev_config_dirty(spa->spa_root_vdev);
				spa_config_exit(spa, SCL_ALL, FTAG);
			}
		}

		spa_event_notify(spa, NULL, ESC_ZFS_POOL_DESTROY);

		if (spa->spa_state != POOL_STATE_UNINITIALIZED) {
			spa_unload(spa);
			spa_deactivate(spa);
		}

		if (oldconfig && spa->spa_config)
			VERIFY(nvlist_dup(spa->spa_config, oldconfig, 0) == 0);

		/* A reset keeps the spa_t in the namespace; anything else drops it. */
		if (new_state != POOL_STATE_UNINITIALIZED) {
			if (!hardforce)
				spa_config_sync(spa, B_TRUE, B_TRUE);
			spa_remove(spa);
		}
		mutex_exit(&spa_namespace_lock);

		return (0);
	}

	/*
	 * Destroy a storage pool.
	 *
	 * Runs the common export path with POOL_STATE_DESTROYED as the target
	 * state, no old-config output, and neither 'force' nor 'hardforce'.
	 * The result of spa_export_common() is returned unchanged.
	 */
	int
	spa_destroy(char *pool)
	{
		int rc;

		rc = spa_export_common(pool, POOL_STATE_DESTROYED, NULL,
		    B_FALSE, B_FALSE);

		return (rc);
	}

	/*
	 * Export a storage pool.
	 *
	 * Runs the common export path with POOL_STATE_EXPORTED as the target
	 * state.  'oldconfig', 'force' and 'hardforce' are passed through
	 * untouched; the result of spa_export_common() is returned unchanged.
	 */
	int
	spa_export(char *pool, nvlist_t **oldconfig, boolean_t force,
	    boolean_t hardforce)
	{
		return (spa_export_common(pool, POOL_STATE_EXPORTED, oldconfig,
		    force, hardforce));
	}

	/*
	 * Similar to spa_export(), this unloads the spa_t without actually
	 * removing it from the namespace in any way.
	 *
	 * Implemented as the common export path with POOL_STATE_UNINITIALIZED
	 * as the target state, no old-config output, and neither 'force' nor
	 * 'hardforce'.
	 */
	int
	spa_reset(char *pool)
	{
		int rc;

		rc = spa_export_common(pool, POOL_STATE_UNINITIALIZED, NULL,
		    B_FALSE, B_FALSE);

		return (rc);
	}

	/*
	* ==========================================================================
	* Device manipulation
	* ==========================================================================
	*/

	/*
	 * Add a device to a storage pool.
	 *
	 * 'nvroot' describes the vdevs to add; it may carry new top-level
	 * children and/or ZPOOL_CONFIG_SPARES / ZPOOL_CONFIG_L2CACHE arrays.
	 *
	 * Returns 0 on success, EINVAL if nvroot contains no children, spares
	 * or cache devices, or the error reported by spa_config_parse(),
	 * vdev_create() or spa_validate_aux().
	 */
	int
	spa_vdev_add(spa_t *spa, nvlist_t *nvroot)
	{
		uint64_t txg, id;
		int error;
		vdev_t *rvd = spa->spa_root_vdev;
		vdev_t *vd, *tvd;
		nvlist_t **spares, **l2cache;
		uint_t nspares, nl2cache;

		ASSERT(spa_writeable(spa));

		txg = spa_vdev_enter(spa);

		if ((error = spa_config_parse(spa, &vd, nvroot, NULL, 0,
		    VDEV_ALLOC_ADD)) != 0)
			return (spa_vdev_exit(spa, NULL, txg, error));

		spa->spa_pending_vdev = vd;	/* spa_vdev_exit() will clear this */

		if (nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_SPARES, &spares,
		    &nspares) != 0)
			nspares = 0;

		if (nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_L2CACHE, &l2cache,
		    &nl2cache) != 0)
			nl2cache = 0;

		/* Nothing at all to add. */
		if (vd->vdev_children == 0 && nspares == 0 && nl2cache == 0)
			return (spa_vdev_exit(spa, vd, txg, EINVAL));

		if (vd->vdev_children != 0 &&
		    (error = vdev_create(vd, txg, B_FALSE)) != 0)
			return (spa_vdev_exit(spa, vd, txg, error));

		/*
		 * We must validate the spares and l2cache devices after checking
		 * the children.  Otherwise, vdev_inuse() will blindly overwrite
		 * the spare.
		 */
		if ((error = spa_validate_aux(spa, nvroot, txg, VDEV_ALLOC_ADD)) != 0)
			return (spa_vdev_exit(spa, vd, txg, error));

		/*
		 * Transfer each new top-level vdev from vd to rvd.
		 */
		for (int c = 0; c < vd->vdev_children; c++) {

			/*
			 * Set the vdev id to the first hole, if one exists.
			 * If there is none, the loop ends with
			 * id == rvd->vdev_children, i.e. the next free slot.
			 */
			for (id = 0; id < rvd->vdev_children; id++) {
				if (rvd->vdev_child[id]->vdev_ishole) {
					vdev_free(rvd->vdev_child[id]);
					break;
				}
			}
			tvd = vd->vdev_child[c];
			vdev_remove_child(vd, tvd);
			tvd->vdev_id = id;
			vdev_add_child(rvd, tvd);
			vdev_config_dirty(tvd);
		}

		if (nspares != 0) {
			spa_set_aux_vdevs(&spa->spa_spares, spares, nspares,
			    ZPOOL_CONFIG_SPARES);
			spa_load_spares(spa);
			spa->spa_spares.sav_sync = B_TRUE;
		}

		if (nl2cache != 0) {
			spa_set_aux_vdevs(&spa->spa_l2cache, l2cache, nl2cache,
			    ZPOOL_CONFIG_L2CACHE);
			spa_load_l2cache(spa);
			spa->spa_l2cache.sav_sync = B_TRUE;
		}

		/*
		 * We have to be careful when adding new vdevs to an existing pool.
		 * If other threads start allocating from these vdevs before we
		 * sync the config cache, and we lose power, then upon reboot we may
		 * fail to open the pool because there are DVAs that the config cache
		 * can't translate.  Therefore, we first add the vdevs without
		 * initializing metaslabs; sync the config cache (via
		 * spa_vdev_exit()); and then let spa_config_update() initialize the
		 * new metaslabs.
		 *
		 * spa_load() checks for added-but-not-initialized vdevs, so that
		 * if we lose power at any point in this sequence, the remaining
		 * steps will be completed the next time we load the pool.
		 */
		(void) spa_vdev_exit(spa, vd, txg, 0);

		mutex_enter(&spa_namespace_lock);
		spa_config_update(spa, SPA_CONFIG_UPDATE_POOL);
		mutex_exit(&spa_namespace_lock);

		return (0);
	}

	/*
	 * Attach a device to a mirror.  The arguments are the path to any device
	 * in the mirror, and the nvroot for the new device.  If the path specifies
	 * a device that is not mirrored, we automatically insert the mirror vdev.
	 *
	 * If 'replacing' is specified, the new device is intended to replace the
	 * existing device; in this case the two devices are made into their own
	 * mirror using the 'replacing' vdev, which is functionally identical to
	 * the mirror vdev (it actually reuses all the same ops) but has a few
	 * extra rules: you can't attach to it after it's been created, and upon
	 * completion of resilvering, the first disk (the one being replaced)
	 * is automatically detached.
	 *
	 * 'guid' identifies the existing leaf vdev; 'nvroot' must describe exactly
	 * one new leaf vdev.
	 *
	 * Returns 0 on success, or (via spa_vdev_exit()) ENODEV if 'guid' is not
	 * found, ENOTSUP for an unsupported vdev combination, EINVAL for a bad
	 * nvroot, EOVERFLOW if the new device is too small, EDOM if its ashift
	 * exceeds the top-level vdev's, or the error from vdev_create().
	 */
	int
	spa_vdev_attach(spa_t *spa, uint64_t guid, nvlist_t *nvroot, int replacing)
	{
		uint64_t txg, dtl_max_txg;
		vdev_t *rvd = spa->spa_root_vdev;
		vdev_t *oldvd, *newvd, *newrootvd, *pvd, *tvd;
		vdev_ops_t *pvops;
		char *oldvdpath, *newvdpath;
		int newvd_isspare;
		int error;

		ASSERT(spa_writeable(spa));

		txg = spa_vdev_enter(spa);

		oldvd = spa_lookup_by_guid(spa, guid, B_FALSE);

		if (oldvd == NULL)
			return (spa_vdev_exit(spa, NULL, txg, ENODEV));

		if (!oldvd->vdev_ops->vdev_op_leaf)
			return (spa_vdev_exit(spa, NULL, txg, ENOTSUP));

		pvd = oldvd->vdev_parent;

		/* Any parse failure is reported to the caller as EINVAL. */
		if ((error = spa_config_parse(spa, &newrootvd, nvroot, NULL, 0,
		    VDEV_ALLOC_ATTACH)) != 0)
			return (spa_vdev_exit(spa, NULL, txg, EINVAL));

		if (newrootvd->vdev_children != 1)
			return (spa_vdev_exit(spa, newrootvd, txg, EINVAL));

		newvd = newrootvd->vdev_child[0];

		if (!newvd->vdev_ops->vdev_op_leaf)
			return (spa_vdev_exit(spa, newrootvd, txg, EINVAL));

		if ((error = vdev_create(newrootvd, txg, replacing)) != 0)
			return (spa_vdev_exit(spa, newrootvd, txg, error));

		/*
		 * Spares can't replace logs
		 */
		if (oldvd->vdev_top->vdev_islog && newvd->vdev_isspare)
			return (spa_vdev_exit(spa, newrootvd, txg, ENOTSUP));

		if (!replacing) {
			/*
			 * For attach, the only allowable parent is a mirror or the
			 * root vdev.
			 */
			if (pvd->vdev_ops != &vdev_mirror_ops &&
			    pvd->vdev_ops != &vdev_root_ops)
				return (spa_vdev_exit(spa, newrootvd, txg, ENOTSUP));

			pvops = &vdev_mirror_ops;
		} else {
			/*
			 * Active hot spares can only be replaced by inactive hot
			 * spares.
			 */
			if (pvd->vdev_ops == &vdev_spare_ops &&
			    oldvd->vdev_isspare &&
			    !spa_has_spare(spa, newvd->vdev_guid))
				return (spa_vdev_exit(spa, newrootvd, txg, ENOTSUP));

			/*
			 * If the source is a hot spare, and the parent isn't already
			 * a spare, then we want to create a new hot spare.
			 * Otherwise, we want to create a replacing vdev.  The user
			 * is not allowed to attach to a spared vdev child unless the
			 * 'isspare' state is the same (spare replaces spare,
			 * non-spare replaces non-spare).
			 */
			if (pvd->vdev_ops == &vdev_replacing_ops &&
			    spa_version(spa) < SPA_VERSION_MULTI_REPLACE) {
				return (spa_vdev_exit(spa, newrootvd, txg, ENOTSUP));
			} else if (pvd->vdev_ops == &vdev_spare_ops &&
			    newvd->vdev_isspare != oldvd->vdev_isspare) {
				return (spa_vdev_exit(spa, newrootvd, txg, ENOTSUP));
			}

			if (newvd->vdev_isspare)
				pvops = &vdev_spare_ops;
			else
				pvops = &vdev_replacing_ops;
		}

		/*
		 * Make sure the new device is big enough.
		 */
		if (newvd->vdev_asize < vdev_get_min_asize(oldvd))
			return (spa_vdev_exit(spa, newrootvd, txg, EOVERFLOW));

		/*
		 * The new device cannot have a higher alignment requirement
		 * than the top-level vdev.
		 */
		if (newvd->vdev_ashift > oldvd->vdev_top->vdev_ashift)
			return (spa_vdev_exit(spa, newrootvd, txg, EDOM));

		/*
		 * If this is an in-place replacement, update oldvd's path and devid
		 * to make it distinguishable from newvd, and unopenable from now on.
		 */
		if (strcmp(oldvd->vdev_path, newvd->vdev_path) == 0) {
			/* "<path>" + "/old" + terminating NUL */
			size_t oldpathsz = strlen(newvd->vdev_path) + 5;

			spa_strfree(oldvd->vdev_path);
			oldvd->vdev_path = kmem_alloc(oldpathsz, KM_SLEEP);
			(void) snprintf(oldvd->vdev_path, oldpathsz, "%s/%s",
			    newvd->vdev_path, "old");
			if (oldvd->vdev_devid != NULL) {
				spa_strfree(oldvd->vdev_devid);
				oldvd->vdev_devid = NULL;
			}
		}

		/* mark the device being resilvered */
		newvd->vdev_resilver_txg = txg;

		/*
		 * If the parent is not a mirror, or if we're replacing, insert the
		 * new mirror/replacing/spare vdev above oldvd.
		 */
		if (pvd->vdev_ops != pvops)
			pvd = vdev_add_parent(oldvd, pvops);

		ASSERT(pvd->vdev_top->vdev_parent == rvd);
		ASSERT(pvd->vdev_ops == pvops);
		ASSERT(oldvd->vdev_parent == pvd);

		/*
		 * Extract the new device from its root and add it to pvd.
		 */
		vdev_remove_child(newrootvd, newvd);
		newvd->vdev_id = pvd->vdev_children;
		newvd->vdev_crtxg = oldvd->vdev_crtxg;
		vdev_add_child(pvd, newvd);

		tvd = newvd->vdev_top;
		ASSERT(pvd->vdev_top == tvd);
		ASSERT(tvd->vdev_parent == rvd);

		vdev_config_dirty(tvd);

		/*
		 * Set newvd's DTL to [TXG_INITIAL, dtl_max_txg) so that we account
		 * for any dmu_sync-ed blocks.  It will propagate upward when
		 * spa_vdev_exit() calls vdev_dtl_reassess().
		 */
		dtl_max_txg = txg + TXG_CONCURRENT_STATES;

		vdev_dtl_dirty(newvd, DTL_MISSING, TXG_INITIAL,
		    dtl_max_txg - TXG_INITIAL);

		if (newvd->vdev_isspare) {
			spa_spare_activate(newvd);
			spa_event_notify(spa, newvd, ESC_ZFS_VDEV_SPARE);
		}

		/* Copy what the history record needs before the config is committed. */
		oldvdpath = spa_strdup(oldvd->vdev_path);
		newvdpath = spa_strdup(newvd->vdev_path);
		newvd_isspare = newvd->vdev_isspare;

		/*
		 * Mark newvd's DTL dirty in this txg.
		 */
		vdev_dirty(tvd, VDD_DTL, newvd, txg);

		/*
		 * Schedule the resilver to restart in the future.  We do this to
		 * ensure that dmu_sync-ed blocks have been stitched into the
		 * respective datasets.
		 */
		dsl_resilver_restart(spa->spa_dsl_pool, dtl_max_txg);

		/*
		 * Commit the config
		 */
		(void) spa_vdev_exit(spa, newrootvd, dtl_max_txg, 0);

		spa_history_log_internal(spa, "vdev attach", NULL,
		    "%s vdev=%s %s vdev=%s",
		    replacing && newvd_isspare ? "spare in" :
		    replacing ? "replace" : "attach", newvdpath,
		    replacing ? "for" : "to", oldvdpath);

		spa_strfree(oldvdpath);
		spa_strfree(newvdpath);

		if (spa->spa_bootfs)
			spa_event_notify(spa, newvd, ESC_ZFS_BOOTFS_VDEV_ATTACH);

		return (0);
	}

	/*
	 * Detach a device from a mirror or replacing vdev.
	 *
	 * If 'replace_done' is specified, only detach if the parent
	 * is a replacing vdev.
	 *
	 * 'guid' identifies the leaf vdev to detach.  A non-zero 'pguid' must
	 * match the guid of that vdev's current parent, otherwise the request
	 * fails with EBUSY.
	 *
	 * Returns (via spa_vdev_exit()) ENODEV if 'guid' is not found, ENOTSUP
	 * if the vdev or its parent does not support detach, EBUSY if the parent
	 * changed or the device holds the only valid copy of some data; otherwise
	 * the result of the final spa_vdev_exit().
	 */
	int
	spa_vdev_detach(spa_t *spa, uint64_t guid, uint64_t pguid, int replace_done)
	{
		uint64_t txg;
		int error;
		vdev_t *rvd = spa->spa_root_vdev;
		vdev_t *vd, *pvd, *cvd, *tvd;
		boolean_t unspare = B_FALSE;
		uint64_t unspare_guid = 0;
		char *vdpath;

		ASSERT(spa_writeable(spa));

		txg = spa_vdev_enter(spa);

		vd = spa_lookup_by_guid(spa, guid, B_FALSE);

		if (vd == NULL)
			return (spa_vdev_exit(spa, NULL, txg, ENODEV));

		if (!vd->vdev_ops->vdev_op_leaf)
			return (spa_vdev_exit(spa, NULL, txg, ENOTSUP));

		pvd = vd->vdev_parent;

		/*
		 * If the parent/child relationship is not as expected, don't do it.
		 * Consider M(A,R(B,C)) -- that is, a mirror of A with a replacing
		 * vdev that's replacing B with C.  The user's intent in replacing
		 * is to go from M(A,B) to M(A,C).  If the user decides to cancel
		 * the replace by detaching C, the expected behavior is to end up
		 * M(A,B).  But suppose that right after deciding to detach C,
		 * the replacement of B completes.  We would have M(A,C), and then
		 * ask to detach C, which would leave us with just A -- not what
		 * the user wanted.  To prevent this, we make sure that the
		 * parent/child relationship hasn't changed -- in this example,
		 * that C's parent is still the replacing vdev R.
		 */
		if (pvd->vdev_guid != pguid && pguid != 0)
			return (spa_vdev_exit(spa, NULL, txg, EBUSY));

		/*
		 * Only 'replacing' or 'spare' vdevs can be replaced.
		 */
		if (replace_done && pvd->vdev_ops != &vdev_replacing_ops &&
		    pvd->vdev_ops != &vdev_spare_ops)
			return (spa_vdev_exit(spa, NULL, txg, ENOTSUP));

		ASSERT(pvd->vdev_ops != &vdev_spare_ops ||
		    spa_version(spa) >= SPA_VERSION_SPARES);

		/*
		 * Only mirror, replacing, and spare vdevs support detach.
		 */
		if (pvd->vdev_ops != &vdev_replacing_ops &&
		    pvd->vdev_ops != &vdev_mirror_ops &&
		    pvd->vdev_ops != &vdev_spare_ops)
			return (spa_vdev_exit(spa, NULL, txg, ENOTSUP));

		/*
		 * If this device has the only valid copy of some data,
		 * we cannot safely detach it.
		 */
		if (vdev_dtl_required(vd))
			return (spa_vdev_exit(spa, NULL, txg, EBUSY));

		ASSERT(pvd->vdev_children >= 2);

		/*
		 * If we are detaching the second disk from a replacing vdev, then
		 * check to see if we changed the original vdev's path to have "/old"
		 * at the end in spa_vdev_attach().  If so, undo that change now.
		 */
		if (pvd->vdev_ops == &vdev_replacing_ops && vd->vdev_id > 0 &&
		    vd->vdev_path != NULL) {
			size_t len = strlen(vd->vdev_path);

			for (int c = 0; c < pvd->vdev_children; c++) {
				cvd = pvd->vdev_child[c];

				if (cvd == vd || cvd->vdev_path == NULL)
					continue;

				if (strncmp(cvd->vdev_path, vd->vdev_path, len) == 0 &&
				    strcmp(cvd->vdev_path + len, "/old") == 0) {
					spa_strfree(cvd->vdev_path);
					cvd->vdev_path = spa_strdup(vd->vdev_path);
					break;
				}
			}
		}

		/*
		 * If we are detaching the original disk from a spare, then it implies
		 * that the spare should become a real disk, and be removed from the
		 * active spare list for the pool.
		 */
		if (pvd->vdev_ops == &vdev_spare_ops &&
		    vd->vdev_id == 0 &&
		    pvd->vdev_child[pvd->vdev_children - 1]->vdev_isspare)
			unspare = B_TRUE;

		/*
		 * Erase the disk labels so the disk can be used for other things.
		 * This must be done after all other error cases are handled,
		 * but before we disembowel vd (so we can still do I/O to it).
		 * But if we can't do it, don't treat the error as fatal --
		 * it may be that the unwritability of the disk is the reason
		 * it's being detached!
		 */
		error = vdev_label_init(vd, 0, VDEV_LABEL_REMOVE);

		/*
		 * Remove vd from its parent and compact the parent's children.
		 */
		vdev_remove_child(pvd, vd);
		vdev_compact_children(pvd);

		/*
		 * Remember one of the remaining children so we can get tvd below.
		 */
		cvd = pvd->vdev_child[pvd->vdev_children - 1];

		/*
		 * If we need to remove the remaining child from the list of hot
		 * spares, do it now, marking the vdev as no longer a spare in the
		 * process.  We must do this before vdev_remove_parent(), because
		 * that can change the GUID if it creates a new toplevel GUID.  For a
		 * similar reason, we must remove the spare now, in the same txg as
		 * the detach; otherwise someone could attach a new sibling, change
		 * the GUID, and the subsequent attempt to
		 * spa_vdev_remove(unspare_guid) would fail.
		 */
		if (unspare) {
			ASSERT(cvd->vdev_isspare);
			spa_spare_remove(cvd);
			unspare_guid = cvd->vdev_guid;
			(void) spa_vdev_remove(spa, unspare_guid, B_TRUE);
			cvd->vdev_unspare = B_TRUE;
		}

		/*
		 * If the parent mirror/replacing vdev only has one child,
		 * the parent is no longer needed.  Remove it from the tree.
		 */
		if (pvd->vdev_children == 1) {
			if (pvd->vdev_ops == &vdev_spare_ops)
				cvd->vdev_unspare = B_FALSE;
			vdev_remove_parent(cvd);
		}


		/*
		 * We don't set tvd until now because the parent we just removed
		 * may have been the previous top-level vdev.
		 */
		tvd = cvd->vdev_top;
		ASSERT(tvd->vdev_parent == rvd);

		/*
		 * Reevaluate the parent vdev state.
		 */
		vdev_propagate_state(cvd);

		/*
		 * If the 'autoexpand' property is set on the pool then automatically
		 * try to expand the size of the pool.  For example if the device we
		 * just detached was smaller than the others, it may be possible to
		 * add metaslabs (i.e. grow the pool).  We need to reopen the vdev
		 * first so that we can obtain the updated sizes of the leaf vdevs.
		 */
		if (spa->spa_autoexpand) {
			vdev_reopen(tvd);
			vdev_expand(tvd, txg);
		}

		vdev_config_dirty(tvd);

		/*
		 * Mark vd's DTL as dirty in this txg.  vdev_dtl_sync() will see that
		 * vd->vdev_detached is set and free vd's DTL object in syncing
		 * context.  But first make sure we're not on any other txg's DTL
		 * list, to prevent vd from being accessed after it's freed.
		 */
		vdpath = spa_strdup(vd->vdev_path);
		for (int t = 0; t < TXG_SIZE; t++)
			(void) txg_list_remove_this(&tvd->vdev_dtl_list, vd, t);
		vd->vdev_detached = B_TRUE;
		vdev_dirty(tvd, VDD_DTL, vd, txg);

		spa_event_notify(spa, vd, ESC_ZFS_VDEV_REMOVE);

		/* hang on to the spa before we release the lock */
		spa_open_ref(spa, FTAG);

		error = spa_vdev_exit(spa, vd, txg, 0);

		spa_history_log_internal(spa, "detach", NULL,
		    "vdev=%s", vdpath);
		spa_strfree(vdpath);

		/*
		 * If this was the removal of the original device in a hot spare vdev,
		 * then we want to go through and remove the device from the hot spare
		 * list of every other pool.
		 */
		if (unspare) {
			spa_t *altspa = NULL;

			mutex_enter(&spa_namespace_lock);
			while ((altspa = spa_next(altspa)) != NULL) {
				if (altspa->spa_state != POOL_STATE_ACTIVE ||
				    altspa == spa)
					continue;

				spa_open_ref(altspa, FTAG);
				mutex_exit(&spa_namespace_lock);
				(void) spa_vdev_remove(altspa, unspare_guid, B_TRUE);
				mutex_enter(&spa_namespace_lock);
				spa_close(altspa, FTAG);
			}
			mutex_exit(&spa_namespace_lock);

			/* search the rest of the vdevs for spares to remove */
			spa_vdev_resilver_done(spa);
		}

		/* all done with the spa; OK to release */
		mutex_enter(&spa_namespace_lock);
		spa_close(spa, FTAG);
		mutex_exit(&spa_namespace_lock);

		return (error);
	}

	/*
	 * Split a set of devices from their mirrors, and create a new pool from
	 * them.
	 *
	 * Arguments:
	 *	spa	the pool being split
	 *	newname	name of the pool to create; must not already exist
	 *	config	config for the new pool; its ZPOOL_CONFIG_VDEV_TREE must
	 *		list one child per top-level vdev of 'spa' (trailing
	 *		logs/holes excluded), each naming the leaf to split off
	 *	props	optional properties to set on the new pool
	 *	exp	if set, the new pool is exported once the split is done
	 *
	 * Returns 0 on success.  Validation failures are returned through
	 * spa_vdev_exit() (EEXIST, EINVAL, ENODEV, EBUSY, or the error from
	 * spa_offline_log()).  If loading the new pool or setting its properties
	 * fails, the new pool is torn down, the split-off disks are brought back
	 * online, and that error is returned.
	 */
	int
	spa_vdev_split_mirror(spa_t *spa, char *newname, nvlist_t *config,
	    nvlist_t *props, boolean_t exp)
	{
		int error = 0;
		uint64_t txg, *glist;
		spa_t *newspa;
		uint_t c, children, lastlog;
		nvlist_t **child, *nvl, *tmp;
		dmu_tx_t *tx;
		char *altroot = NULL;
		vdev_t *rvd, **vml = NULL;		/* vdev modify list */
		boolean_t activate_slog;

		ASSERT(spa_writeable(spa));

		txg = spa_vdev_enter(spa);

		/* clear the log and flush everything up to now */
		activate_slog = spa_passivate_log(spa);
		(void) spa_vdev_config_exit(spa, NULL, txg, 0, FTAG);
		error = spa_offline_log(spa);
		txg = spa_vdev_config_enter(spa);

		if (activate_slog)
			spa_activate_log(spa);

		if (error != 0)
			return (spa_vdev_exit(spa, NULL, txg, error));

		/* check new spa name before going any further */
		if (spa_lookup(newname) != NULL)
			return (spa_vdev_exit(spa, NULL, txg, EEXIST));

		/*
		 * scan through all the children to ensure they're all mirrors
		 */
		if (nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, &nvl) != 0 ||
		    nvlist_lookup_nvlist_array(nvl, ZPOOL_CONFIG_CHILDREN, &child,
		    &children) != 0)
			return (spa_vdev_exit(spa, NULL, txg, EINVAL));

		/* first, check to ensure we've got the right child count */
		rvd = spa->spa_root_vdev;
		lastlog = 0;
		for (c = 0; c < rvd->vdev_children; c++) {
			vdev_t *vd = rvd->vdev_child[c];

			/* don't count the holes & logs as children */
			if (vd->vdev_islog || vd->vdev_ishole) {
				if (lastlog == 0)
					lastlog = c;
				continue;
			}

			lastlog = 0;
		}
		if (children != (lastlog != 0 ? lastlog : rvd->vdev_children))
			return (spa_vdev_exit(spa, NULL, txg, EINVAL));

		/* next, ensure no spare or cache devices are part of the split */
		if (nvlist_lookup_nvlist(nvl, ZPOOL_CONFIG_SPARES, &tmp) == 0 ||
		    nvlist_lookup_nvlist(nvl, ZPOOL_CONFIG_L2CACHE, &tmp) == 0)
			return (spa_vdev_exit(spa, NULL, txg, EINVAL));

		vml = kmem_zalloc(children * sizeof (vdev_t *), KM_SLEEP);
		glist = kmem_zalloc(children * sizeof (uint64_t), KM_SLEEP);

		/* then, loop over each vdev and validate it */
		for (c = 0; c < children; c++) {
			uint64_t is_hole = 0;

			(void) nvlist_lookup_uint64(child[c], ZPOOL_CONFIG_IS_HOLE,
			    &is_hole);

			if (is_hole != 0) {
				if (spa->spa_root_vdev->vdev_child[c]->vdev_ishole ||
				    spa->spa_root_vdev->vdev_child[c]->vdev_islog) {
					continue;
				} else {
					error = SET_ERROR(EINVAL);
					break;
				}
			}

			/* which disk is going to be split? */
			if (nvlist_lookup_uint64(child[c], ZPOOL_CONFIG_GUID,
			    &glist[c]) != 0) {
				error = SET_ERROR(EINVAL);
				break;
			}

			/* look it up in the spa */
			vml[c] = spa_lookup_by_guid(spa, glist[c], B_FALSE);
			if (vml[c] == NULL) {
				error = SET_ERROR(ENODEV);
				break;
			}

			/* make sure there's nothing stopping the split */
			if (vml[c]->vdev_parent->vdev_ops != &vdev_mirror_ops ||
			    vml[c]->vdev_islog ||
			    vml[c]->vdev_ishole ||
			    vml[c]->vdev_isspare ||
			    vml[c]->vdev_isl2cache ||
			    !vdev_writeable(vml[c]) ||
			    vml[c]->vdev_children != 0 ||
			    vml[c]->vdev_state != VDEV_STATE_HEALTHY ||
			    c != spa->spa_root_vdev->vdev_child[c]->vdev_id) {
				error = SET_ERROR(EINVAL);
				break;
			}

			if (vdev_dtl_required(vml[c])) {
				error = SET_ERROR(EBUSY);
				break;
			}

			/* we need certain info from the top level */
			VERIFY(nvlist_add_uint64(child[c], ZPOOL_CONFIG_METASLAB_ARRAY,
			    vml[c]->vdev_top->vdev_ms_array) == 0);
			VERIFY(nvlist_add_uint64(child[c], ZPOOL_CONFIG_METASLAB_SHIFT,
			    vml[c]->vdev_top->vdev_ms_shift) == 0);
			VERIFY(nvlist_add_uint64(child[c], ZPOOL_CONFIG_ASIZE,
			    vml[c]->vdev_top->vdev_asize) == 0);
			VERIFY(nvlist_add_uint64(child[c], ZPOOL_CONFIG_ASHIFT,
			    vml[c]->vdev_top->vdev_ashift) == 0);
		}

		if (error != 0) {
			kmem_free(vml, children * sizeof (vdev_t *));
			kmem_free(glist, children * sizeof (uint64_t));
			return (spa_vdev_exit(spa, NULL, txg, error));
		}

		/* stop writers from using the disks */
		for (c = 0; c < children; c++) {
			if (vml[c] != NULL)
				vml[c]->vdev_offline = B_TRUE;
		}
		vdev_reopen(spa->spa_root_vdev);

		/*
		 * Temporarily record the splitting vdevs in the spa config.  This
		 * will disappear once the config is regenerated.
		 */
		VERIFY(nvlist_alloc(&nvl, NV_UNIQUE_NAME, KM_SLEEP) == 0);
		VERIFY(nvlist_add_uint64_array(nvl, ZPOOL_CONFIG_SPLIT_LIST,
		    glist, children) == 0);
		kmem_free(glist, children * sizeof (uint64_t));

		mutex_enter(&spa->spa_props_lock);
		VERIFY(nvlist_add_nvlist(spa->spa_config, ZPOOL_CONFIG_SPLIT,
		    nvl) == 0);
		mutex_exit(&spa->spa_props_lock);
		spa->spa_config_splitting = nvl;
		vdev_config_dirty(spa->spa_root_vdev);

		/* configure and create the new pool */
		VERIFY(nvlist_add_string(config, ZPOOL_CONFIG_POOL_NAME, newname) == 0);
		VERIFY(nvlist_add_uint64(config, ZPOOL_CONFIG_POOL_STATE,
		    exp ? POOL_STATE_EXPORTED : POOL_STATE_ACTIVE) == 0);
		VERIFY(nvlist_add_uint64(config, ZPOOL_CONFIG_VERSION,
		    spa_version(spa)) == 0);
		VERIFY(nvlist_add_uint64(config, ZPOOL_CONFIG_POOL_TXG,
		    spa->spa_config_txg) == 0);
		VERIFY(nvlist_add_uint64(config, ZPOOL_CONFIG_POOL_GUID,
		    spa_generate_guid(NULL)) == 0);
		(void) nvlist_lookup_string(props,
		    zpool_prop_to_name(ZPOOL_PROP_ALTROOT), &altroot);

		/* add the new pool to the namespace */
		newspa = spa_add(newname, config, altroot);
		newspa->spa_config_txg = spa->spa_config_txg;
		spa_set_log_state(newspa, SPA_LOG_CLEAR);

		/* release the spa config lock, retaining the namespace lock */
		(void) spa_vdev_config_exit(spa, NULL, txg, 0, FTAG);

		if (zio_injection_enabled)
			zio_handle_panic_injection(spa, FTAG, 1);

		spa_activate(newspa, spa_mode_global);
		spa_async_suspend(newspa);

		/* create the new pool from the disks of the original pool */
		error = spa_load(newspa, SPA_LOAD_IMPORT, SPA_IMPORT_ASSEMBLE, B_TRUE);
		if (error)
			goto out;

		/* if that worked, generate a real config for the new pool */
		if (newspa->spa_root_vdev != NULL) {
			VERIFY(nvlist_alloc(&newspa->spa_config_splitting,
			    NV_UNIQUE_NAME, KM_SLEEP) == 0);
			VERIFY(nvlist_add_uint64(newspa->spa_config_splitting,
			    ZPOOL_CONFIG_SPLIT_GUID, spa_guid(spa)) == 0);
			spa_config_set(newspa, spa_config_generate(newspa, NULL, -1ULL,
			    B_TRUE));
		}

		/* set the props */
		if (props != NULL) {
			spa_configfile_set(newspa, props, B_FALSE);
			error = spa_prop_set(newspa, props);
			if (error)
				goto out;
		}

		/* flush everything */
		txg = spa_vdev_config_enter(newspa);
		vdev_config_dirty(newspa->spa_root_vdev);
		(void) spa_vdev_config_exit(newspa, NULL, txg, 0, FTAG);

		if (zio_injection_enabled)
			zio_handle_panic_injection(spa, FTAG, 2);

		spa_async_resume(newspa);

		/* finally, update the original pool's config */
		txg = spa_vdev_config_enter(spa);
		tx = dmu_tx_create_dd(spa_get_dsl(spa)->dp_mos_dir);
		error = dmu_tx_assign(tx, TXG_WAIT);
		if (error != 0)
			dmu_tx_abort(tx);
		for (c = 0; c < children; c++) {
			if (vml[c] != NULL) {
				vdev_split(vml[c]);
				/* only log when there is an assigned tx to log in */
				if (error == 0)
					spa_history_log_internal(spa, "detach", tx,
					    "vdev=%s", vml[c]->vdev_path);
				vdev_free(vml[c]);
			}
		}
		vdev_config_dirty(spa->spa_root_vdev);
		spa->spa_config_splitting = NULL;
		nvlist_free(nvl);
		if (error == 0)
			dmu_tx_commit(tx);
		(void) spa_vdev_exit(spa, NULL, txg, 0);

		if (zio_injection_enabled)
			zio_handle_panic_injection(spa, FTAG, 3);

		/* split is complete; log a history record */
		spa_history_log_internal(newspa, "split", NULL,
		    "from pool %s", spa_name(spa));

		kmem_free(vml, children * sizeof (vdev_t *));

		/* if we're not going to mount the filesystems in userland, export */
		if (exp)
			error = spa_export_common(newname, POOL_STATE_EXPORTED, NULL,
			    B_FALSE, B_FALSE);

		return (error);

	out:
		/* Tear down the half-built new pool. */
		spa_unload(newspa);
		spa_deactivate(newspa);
		spa_remove(newspa);

		txg = spa_vdev_config_enter(spa);

		/* re-online all offlined disks */
		for (c = 0; c < children; c++) {
			if (vml[c] != NULL)
				vml[c]->vdev_offline = B_FALSE;
		}
		vdev_reopen(spa->spa_root_vdev);

		nvlist_free(spa->spa_config_splitting);
		spa->spa_config_splitting = NULL;
		(void) spa_vdev_exit(spa, NULL, txg, error);

		kmem_free(vml, children * sizeof (vdev_t *));
		return (error);
	}

	/*
	 * Scan the first 'count' entries of 'nvpp' and return the first nvlist
	 * whose ZPOOL_CONFIG_GUID equals 'target_guid', or NULL if none does.
	 * Every scanned entry must carry a ZPOOL_CONFIG_GUID (enforced with
	 * VERIFY).
	 */
	static nvlist_t *
	spa_nvlist_lookup_by_guid(nvlist_t **nvpp, int count, uint64_t target_guid)
	{
		nvlist_t *match = NULL;
		int idx;

		for (idx = 0; idx < count; idx++) {
			uint64_t cur_guid;

			VERIFY(nvlist_lookup_uint64(nvpp[idx], ZPOOL_CONFIG_GUID,
			    &cur_guid) == 0);

			if (cur_guid != target_guid)
				continue;

			match = nvpp[idx];
			break;
		}

		return (match);
	}

	/*
	 * Rebuild the nvlist array 'name' inside 'config' without the entry
	 * 'dev_to_remove'.
	 *
	 * 'dev' is the current array of 'count' nvlists.  Every entry other than
	 * 'dev_to_remove' is duplicated into a temporary array, the old array is
	 * removed from 'config', and the temporary array (count - 1 entries) is
	 * added in its place.  The temporary duplicates are then freed.
	 *
	 * NOTE(review): 'dev_to_remove' is expected to be exactly one element of
	 * 'dev'; otherwise the temporary array would be overrun -- callers in
	 * view obtain it from spa_nvlist_lookup_by_guid() on the same array.
	 */
	static void
	spa_vdev_remove_aux(nvlist_t *config, char *name, nvlist_t **dev, int count,
	    nvlist_t *dev_to_remove)
	{
		nvlist_t **newdev = NULL;

		/* With a single entry the replacement array is empty (NULL). */
		if (count > 1)
			newdev = kmem_alloc((count - 1) * sizeof (void *), KM_SLEEP);

		for (int i = 0, j = 0; i < count; i++) {
			if (dev[i] == dev_to_remove)
				continue;
			VERIFY(nvlist_dup(dev[i], &newdev[j++], KM_SLEEP) == 0);
		}

		VERIFY(nvlist_remove(config, name, DATA_TYPE_NVLIST_ARRAY) == 0);
		VERIFY(nvlist_add_nvlist_array(config, name, newdev, count - 1) == 0);

		for (int i = 0; i < count - 1; i++)
			nvlist_free(newdev[i]);

		if (count > 1)
			kmem_free(newdev, (count - 1) * sizeof (void *));
	}

	/*
	 * Evacuate the device.
	 *
	 * 'vd' must be a top-level vdev.  The caller must hold
	 * spa_namespace_lock and must not hold the config lock as writer.
	 *
	 * Only log devices are supported: a log vdev with allocated space is
	 * evacuated with spa_offline_log(); any other vdev fails with ENOTSUP.
	 * On success the vdev is marked as removing and dirtied.
	 *
	 * Returns 0 on success or the error described above.
	 */
	static int
	spa_vdev_remove_evacuate(spa_t *spa, vdev_t *vd)
	{
		uint64_t txg;
		int error = 0;

		ASSERT(MUTEX_HELD(&spa_namespace_lock));
		ASSERT(spa_config_held(spa, SCL_ALL, RW_WRITER) == 0);
		ASSERT(vd == vd->vdev_top);

		/*
		 * Evacuate the device.  We don't hold the config lock as writer
		 * since we need to do I/O but we do keep the
		 * spa_namespace_lock held.  Once this completes the device
		 * should no longer have any blocks allocated on it.
		 */
		if (vd->vdev_islog) {
			if (vd->vdev_stat.vs_alloc != 0)
				error = spa_offline_log(spa);
		} else {
			error = SET_ERROR(ENOTSUP);
		}

		if (error)
			return (error);

		/*
		 * The evacuation succeeded.  Remove any remaining MOS metadata
		 * associated with this vdev, and wait for these changes to sync.
		 */
		ASSERT0(vd->vdev_stat.vs_alloc);
		txg = spa_vdev_config_enter(spa);
		vd->vdev_removing = B_TRUE;
		vdev_dirty_leaves(vd, VDD_DTL, txg);
		vdev_config_dirty(vd);
		spa_vdev_config_exit(spa, NULL, txg, 0, FTAG);

		return (0);
	}

	/*
	 * Complete the removal by cleaning up the namespace.
	 *
	 * 'vd' must be a top-level vdev.  The caller must hold
	 * spa_namespace_lock and all config locks as writer.
	 *
	 * Nothing is done if the vdev still has allocated space.  Otherwise its
	 * labels are removed and the vdev is freed.  If it was the last child of
	 * the root vdev the child array is compacted; if not, a hole vdev is
	 * placed at its id.
	 */
	static void
	spa_vdev_remove_from_namespace(spa_t *spa, vdev_t *vd)
	{
		vdev_t *rvd = spa->spa_root_vdev;
		uint64_t id = vd->vdev_id;
		boolean_t last_vdev = (id == (rvd->vdev_children - 1));

		ASSERT(MUTEX_HELD(&spa_namespace_lock));
		ASSERT(spa_config_held(spa, SCL_ALL, RW_WRITER) == SCL_ALL);
		ASSERT(vd == vd->vdev_top);

		/*
		 * Only remove any devices which are empty.
		 */
		if (vd->vdev_stat.vs_alloc != 0)
			return;

		(void) vdev_label_init(vd, 0, VDEV_LABEL_REMOVE);

		/* Take vd off the dirty lists before it is freed. */
		if (list_link_active(&vd->vdev_state_dirty_node))
			vdev_state_clean(vd);
		if (list_link_active(&vd->vdev_config_dirty_node))
			vdev_config_clean(vd);

		vdev_free(vd);

		if (last_vdev) {
			vdev_compact_children(rvd);
		} else {
			/* Keep the slot occupied with a hole vdev of the same id. */
			vd = vdev_alloc_common(spa, id, 0, &vdev_hole_ops);
			vdev_add_child(rvd, vd);
		}
		vdev_config_dirty(rvd);

		/*
		 * Reassess the health of our root vdev.
		 */
		vdev_reopen(rvd);
	}

	/*
	 * Remove a device from the pool -
	 *
	 * Removing a device from the vdev namespace requires several steps
	 * and can take a significant amount of time.  As a result we use
	 * the spa_vdev_config_[enter/exit] functions which allow us to
	 * grab and release the spa_config_lock while still holding the namespace
	 * lock.  During each step the configuration is synced out.
	 *
	 * Currently, this supports removing only hot spares, slogs, and level 2
	 * ARC devices.
	 *
	 * If the caller already holds spa_namespace_lock, spa_vdev_enter() and
	 * spa_vdev_exit() are skipped and the error is returned directly; log
	 * device removal is not permitted on that path (asserted).
	 *
	 * 'unspare' allows a hot spare to be removed even though a vdev with
	 * that guid is currently in use in this pool.
	 *
	 * Returns 0 on success, EBUSY for an in-use spare without 'unspare',
	 * ENOTSUP for a normal vdev, ENOENT if no device has the given guid, or
	 * the error from evacuating a log device.
	 */
	int
	spa_vdev_remove(spa_t *spa, uint64_t guid, boolean_t unspare)
	{
		vdev_t *vd;
		metaslab_group_t *mg;
		nvlist_t **spares, **l2cache, *nv;
		uint64_t txg = 0;
		uint_t nspares, nl2cache;
		int error = 0;
		boolean_t locked = MUTEX_HELD(&spa_namespace_lock);

		ASSERT(spa_writeable(spa));

		if (!locked)
			txg = spa_vdev_enter(spa);

		vd = spa_lookup_by_guid(spa, guid, B_FALSE);

		if (spa->spa_spares.sav_vdevs != NULL &&
		    nvlist_lookup_nvlist_array(spa->spa_spares.sav_config,
		    ZPOOL_CONFIG_SPARES, &spares, &nspares) == 0 &&
		    (nv = spa_nvlist_lookup_by_guid(spares, nspares, guid)) != NULL) {
			/*
			 * Only remove the hot spare if it's not currently in use
			 * in this pool.
			 */
			if (vd == NULL || unspare) {
				spa_vdev_remove_aux(spa->spa_spares.sav_config,
				    ZPOOL_CONFIG_SPARES, spares, nspares, nv);
				spa_load_spares(spa);
				spa->spa_spares.sav_sync = B_TRUE;
			} else {
				error = SET_ERROR(EBUSY);
			}
		} else if (spa->spa_l2cache.sav_vdevs != NULL &&
		    nvlist_lookup_nvlist_array(spa->spa_l2cache.sav_config,
		    ZPOOL_CONFIG_L2CACHE, &l2cache, &nl2cache) == 0 &&
		    (nv = spa_nvlist_lookup_by_guid(l2cache, nl2cache, guid)) != NULL) {
			/*
			 * Cache devices can always be removed.
			 */
			spa_vdev_remove_aux(spa->spa_l2cache.sav_config,
			    ZPOOL_CONFIG_L2CACHE, l2cache, nl2cache, nv);
			spa_load_l2cache(spa);
			spa->spa_l2cache.sav_sync = B_TRUE;
		} else if (vd != NULL && vd->vdev_islog) {
			ASSERT(!locked);
			ASSERT(vd == vd->vdev_top);

			mg = vd->vdev_mg;

			/*
			 * Stop allocating from this vdev.
			 */
			metaslab_group_passivate(mg);

			/*
			 * Wait for the youngest allocations and frees to sync,
			 * and then wait for the deferral of those frees to finish.
			 */
			spa_vdev_config_exit(spa, NULL,
			    txg + TXG_CONCURRENT_STATES + TXG_DEFER_SIZE, 0, FTAG);

			/*
			 * Attempt to evacuate the vdev.
			 */
			error = spa_vdev_remove_evacuate(spa, vd);

			txg = spa_vdev_config_enter(spa);

			/*
			 * If we couldn't evacuate the vdev, unwind.
			 */
			if (error) {
				metaslab_group_activate(mg);
				return (spa_vdev_exit(spa, NULL, txg, error));
			}

			/*
			 * Clean up the vdev namespace.
			 */
			spa_vdev_remove_from_namespace(spa, vd);

		} else if (vd != NULL) {
			/*
			 * Normal vdevs cannot be removed (yet).
			 */
			error = SET_ERROR(ENOTSUP);
		} else {
			/*
			 * There is no vdev of any kind with the specified guid.
			 */
			error = SET_ERROR(ENOENT);
		}

		if (!locked)
			return (spa_vdev_exit(spa, NULL, txg, error));

		return (error);
	}

	/*
	 * Find any device that's done replacing, or a vdev marked 'unspare' that's
	 * currently spared, so we can detach it.
	 *
	 * The children of 'vd' are searched (recursively) before 'vd' itself is
	 * examined.  Returns the vdev that should be detached, or NULL if nothing
	 * under 'vd' qualifies.
	 */
	static vdev_t *
	spa_vdev_resilver_done_hunt(vdev_t *vd)
	{
		vdev_t *newvd, *oldvd;

		for (int c = 0; c < vd->vdev_children; c++) {
			oldvd = spa_vdev_resilver_done_hunt(vd->vdev_child[c]);
			if (oldvd != NULL)
				return (oldvd);
		}

		/*
		 * Check for a completed replacement.  We always consider the first
		 * vdev in the list to be the oldest vdev, and the last one to be
		 * the newest (see spa_vdev_attach() for how that works).  In
		 * the case where the newest vdev is faulted, we will not automatically
		 * remove it after a resilver completes.  This is OK as it will require
		 * user intervention to determine which disk the admin wishes to keep.
		 */
		if (vd->vdev_ops == &vdev_replacing_ops) {
			ASSERT(vd->vdev_children > 1);

			newvd = vd->vdev_child[vd->vdev_children - 1];
			oldvd = vd->vdev_child[0];

			if (vdev_dtl_empty(newvd, DTL_MISSING) &&
			    vdev_dtl_empty(newvd, DTL_OUTAGE) &&
			    !vdev_dtl_required(oldvd))
				return (oldvd);
		}

		/*
		 * Check for a completed resilver with the 'unspare' flag set.
		 */
		if (vd->vdev_ops == &vdev_spare_ops) {
			vdev_t *first = vd->vdev_child[0];
			vdev_t *last = vd->vdev_child[vd->vdev_children - 1];

			if (last->vdev_unspare) {
				oldvd = first;
				newvd = last;
			} else if (first->vdev_unspare) {
				oldvd = last;
				newvd = first;
			} else {
				oldvd = NULL;
			}

			/* newvd is only read when oldvd was set above. */
			if (oldvd != NULL &&
			    vdev_dtl_empty(newvd, DTL_MISSING) &&
			    vdev_dtl_empty(newvd, DTL_OUTAGE) &&
			    !vdev_dtl_required(oldvd))
				return (oldvd);

			/*
			 * If there are more than two spares attached to a disk,
			 * and those spares are not required, then we want to
			 * attempt to free them up now so that they can be used
			 * by other pools.  Once we're back down to a single
			 * disk+spare, we stop removing them.
			 */
			if (vd->vdev_children > 2) {
				newvd = vd->vdev_child[1];

				if (newvd->vdev_isspare && last->vdev_isspare &&
				    vdev_dtl_empty(last, DTL_MISSING) &&
				    vdev_dtl_empty(last, DTL_OUTAGE) &&
				    !vdev_dtl_required(newvd))
					return (newvd);
			}
		}

		return (NULL);
	}

	/*
	 * Detach every vdev that spa_vdev_resilver_done_hunt() finds under the
	 * root vdev.
	 *
	 * The hunt runs with SCL_ALL held as writer.  The lock is dropped around
	 * each spa_vdev_detach() call and retaken before hunting again.  If a
	 * detach fails we return immediately, with the lock already released.
	 */
	static void
	spa_vdev_resilver_done(spa_t *spa)
	{
		vdev_t *vd, *pvd, *ppvd;
		uint64_t guid, sguid, pguid, ppguid;

		spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);

		while ((vd = spa_vdev_resilver_done_hunt(spa->spa_root_vdev)) != NULL) {
			/*
			 * Capture the guids now: the vdev pointers are not used
			 * once the config lock has been dropped below.
			 */
			pvd = vd->vdev_parent;
			ppvd = pvd->vdev_parent;
			guid = vd->vdev_guid;
			pguid = pvd->vdev_guid;
			ppguid = ppvd->vdev_guid;
			sguid = 0;
			/*
			 * If we have just finished replacing a hot spared device, then
			 * we need to detach the parent's first child (the original hot
			 * spare) as well.
			 */
			if (ppvd->vdev_ops == &vdev_spare_ops && pvd->vdev_id == 0 &&
			    ppvd->vdev_children == 2) {
				ASSERT(pvd->vdev_ops == &vdev_replacing_ops);
				sguid = ppvd->vdev_child[1]->vdev_guid;
			}
			ASSERT(vd->vdev_resilver_txg == 0 || !vdev_dtl_required(vd));

			spa_config_exit(spa, SCL_ALL, FTAG);
			if (spa_vdev_detach(spa, guid, pguid, B_TRUE) != 0)
				return;
			if (sguid && spa_vdev_detach(spa, sguid, ppguid, B_TRUE) != 0)
				return;
			spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
		}

		spa_config_exit(spa, SCL_ALL, FTAG);
	}

	/*
	 * Update the stored path or FRU for this vdev.
	 *
	 * 'guid' is looked up among the pool's vdevs including aux devices.
	 * When 'ispath' is true, 'value' replaces vdev_path; otherwise it
	 * replaces (or initially sets) vdev_fru.  The vdev is handed to
	 * spa_vdev_state_exit() only if the stored string actually changed.
	 *
	 * Exits through spa_vdev_state_exit() with ENOENT if no vdev has the
	 * given guid, ENOTSUP if the vdev is not a leaf, and 0 otherwise.
	 */
	int
	spa_vdev_set_common(spa_t *spa, uint64_t guid, const char *value,
	    boolean_t ispath)
	{
		vdev_t *vd;
		boolean_t sync = B_FALSE;

		ASSERT(spa_writeable(spa));

		spa_vdev_state_enter(spa, SCL_ALL);

		if ((vd = spa_lookup_by_guid(spa, guid, B_TRUE)) == NULL)
			return (spa_vdev_state_exit(spa, NULL, ENOENT));

		if (!vd->vdev_ops->vdev_op_leaf)
			return (spa_vdev_state_exit(spa, NULL, ENOTSUP));

		if (ispath) {
			if (strcmp(value, vd->vdev_path) != 0) {
				spa_strfree(vd->vdev_path);
				vd->vdev_path = spa_strdup(value);
				sync = B_TRUE;
			}
		} else {
			/* Unlike the path, the FRU may not have been set yet. */
			if (vd->vdev_fru == NULL) {
				vd->vdev_fru = spa_strdup(value);
				sync = B_TRUE;
			} else if (strcmp(value, vd->vdev_fru) != 0) {
				spa_strfree(vd->vdev_fru);
				vd->vdev_fru = spa_strdup(value);
				sync = B_TRUE;
			}
		}

		return (spa_vdev_state_exit(spa, sync ? vd : NULL, 0));
	}

	/*
	 * Set the stored device path of the vdev identified by 'guid'.
	 * Thin wrapper around spa_vdev_set_common(); returns its result.
	 */
	int
	spa_vdev_setpath(spa_t *spa, uint64_t guid, const char *newpath)
	{
		return (spa_vdev_set_common(spa, guid, newpath, B_TRUE));
	}

	/*
	 * Set the stored FRU of the vdev identified by 'guid'.
	 * Thin wrapper around spa_vdev_set_common(); returns its result.
	 */
	int
	spa_vdev_setfru(spa_t *spa, uint64_t guid, const char *newfru)
	{
		return (spa_vdev_set_common(spa, guid, newfru, B_FALSE));
	}

	/*
	* ==========================================================================
	* SPA Scanning
	* ==========================================================================
	*/

	/*
	 * Cancel the pool's current scan.  Must be called without SCL_ALL held
	 * as writer.  Returns EBUSY while dsl_scan_resilvering() reports true,
	 * otherwise the result of dsl_scan_cancel().
	 */
	int
	spa_scan_stop(spa_t *spa)
	{
		int err;

		ASSERT(spa_config_held(spa, SCL_ALL, RW_WRITER) == 0);

		if (dsl_scan_resilvering(spa->spa_dsl_pool)) {
			err = SET_ERROR(EBUSY);
		} else {
			err = dsl_scan_cancel(spa->spa_dsl_pool);
		}

		return (err);
	}

	/*
	 * Start a scan of type 'func' on the pool.  Must be called without
	 * SCL_ALL held as writer.
	 *
	 * Returns ENOTSUP if 'func' is POOL_SCAN_NONE or out of range, 0 if a
	 * resilver was requested but none is needed, and otherwise the result
	 * of dsl_scan().
	 */
	int
	spa_scan(spa_t *spa, pool_scan_func_t func)
	{
		ASSERT(spa_config_held(spa, SCL_ALL, RW_WRITER) == 0);

		if (func >= POOL_SCAN_FUNCS || func == POOL_SCAN_NONE)
			return (SET_ERROR(ENOTSUP));

		/*
		 * If a resilver was requested, but there is no DTL on a
		 * writeable leaf device, we have nothing to do.
		 */
		if (func == POOL_SCAN_RESILVER &&
		    !vdev_resilver_needed(spa->spa_root_vdev, NULL, NULL)) {
			/* Still queue the resilver-done async task. */
			spa_async_request(spa, SPA_ASYNC_RESILVER_DONE);
			return (0);
		}

		return (dsl_scan(spa->spa_dsl_pool, func));
	}

	/*
	* ==========================================================================
	* SPA async task processing
	* ==========================================================================
	*/

	/*
	 * Walk the vdev tree rooted at 'vd' and move every vdev that has
	 * vdev_remove_wanted set into VDEV_STATE_REMOVED, clearing its error
	 * counters and dirtying the state of its top-level vdev.
	 */
	static void
	spa_async_remove(spa_t *spa, vdev_t *vd)
	{
		if (vd->vdev_remove_wanted) {
			vd->vdev_remove_wanted = B_FALSE;
			vd->vdev_delayed_close = B_FALSE;
			vdev_set_state(vd, B_FALSE, VDEV_STATE_REMOVED, VDEV_AUX_NONE);

			/*
			 * We want to clear the stats, but we don't want to do a full
			 * vdev_clear() as that will cause us to throw away
			 * degraded/faulted state as well as attempt to reopen the
			 * device, all of which is a waste.
			 */
			vd->vdev_stat.vs_read_errors = 0;
			vd->vdev_stat.vs_write_errors = 0;
			vd->vdev_stat.vs_checksum_errors = 0;

			vdev_state_dirty(vd->vdev_top);
		}

		for (int c = 0; c < vd->vdev_children; c++)
			spa_async_remove(spa, vd->vdev_child[c]);
	}

	/*
	 * Walk the vdev tree rooted at 'vd' and reopen every vdev that has
	 * vdev_probe_wanted set, clearing the flag first.
	 */
	static void
	spa_async_probe(spa_t *spa, vdev_t *vd)
	{
		if (vd->vdev_probe_wanted) {
			vd->vdev_probe_wanted = B_FALSE;
			vdev_reopen(vd);	/* vdev_open() does the actual probe */
		}

		for (int c = 0; c < vd->vdev_children; c++)
			spa_async_probe(spa, vd->vdev_child[c]);
	}

	/*
	 * If the pool has autoexpand enabled, post an ESC_DEV_DLE sysevent for
	 * every leaf vdev under 'vd' that has a physical path.  The event
	 * carries the path prefixed with "/devices" as DEV_PHYS_PATH.
	 * Children are visited before 'vd' itself.
	 */
	static void
	spa_async_autoexpand(spa_t *spa, vdev_t *vd)
	{
		sysevent_id_t eid;
		nvlist_t *attr;
		char *physpath;

		if (!spa->spa_autoexpand)
			return;

		for (int c = 0; c < vd->vdev_children; c++) {
			vdev_t *cvd = vd->vdev_child[c];
			spa_async_autoexpand(spa, cvd);
		}

		if (!vd->vdev_ops->vdev_op_leaf || vd->vdev_physpath == NULL)
			return;

		physpath = kmem_zalloc(MAXPATHLEN, KM_SLEEP);
		(void) snprintf(physpath, MAXPATHLEN, "/devices%s", vd->vdev_physpath);

		VERIFY(nvlist_alloc(&attr, NV_UNIQUE_NAME, KM_SLEEP) == 0);
		VERIFY(nvlist_add_string(attr, DEV_PHYS_PATH, physpath) == 0);

		/* Failure to post the event is deliberately ignored. */
		(void) ddi_log_sysevent(zfs_dip, SUNW_VENDOR, EC_DEV_STATUS,
		    ESC_DEV_DLE, attr, &eid, DDI_SLEEP);

		nvlist_free(attr);
		kmem_free(physpath, MAXPATHLEN);
	}

	/*
	 * Body of the pool's async worker thread.
	 *
	 * Takes a snapshot of spa_async_tasks and clears it under
	 * spa_async_lock, runs each task whose bit was set in the snapshot,
	 * then clears spa_async_thread, broadcasts spa_async_cv (which
	 * spa_async_suspend() waits on) and exits the thread.
	 */
	static void
	spa_async_thread(spa_t *spa)
	{
	int tasks;

	ASSERT(spa->spa_sync_on);

	/*
	 * Claim all currently requested tasks; requests made after this
	 * point stay in spa_async_tasks and are not handled by this run.
	 */
	mutex_enter(&spa->spa_async_lock);
	tasks = spa->spa_async_tasks;
	spa->spa_async_tasks = 0;
	mutex_exit(&spa->spa_async_lock);

	/*
	* See if the config needs to be updated.
	*/
	if (tasks & SPA_ASYNC_CONFIG_UPDATE) {
	uint64_t old_space, new_space;

	mutex_enter(&spa_namespace_lock);
	old_space = metaslab_class_get_space(spa_normal_class(spa));
	spa_config_update(spa, SPA_CONFIG_UPDATE_POOL);
	new_space = metaslab_class_get_space(spa_normal_class(spa));
	mutex_exit(&spa_namespace_lock);

	/*
	* If the pool grew as a result of the config update,
	* then log an internal history event.
	*/
	if (new_space != old_space) {
	spa_history_log_internal(spa, "vdev online", NULL,
	"pool '%s' size: %llu(+%llu)",
	spa_name(spa), new_space, new_space - old_space);
	}
	}

	/*
	* See if any devices need to be marked REMOVED.
	*/
	if (tasks & SPA_ASYNC_REMOVE) {
	spa_vdev_state_enter(spa, SCL_NONE);
	spa_async_remove(spa, spa->spa_root_vdev);
	for (int i = 0; i < spa->spa_l2cache.sav_count; i++)
	spa_async_remove(spa, spa->spa_l2cache.sav_vdevs[i]);
	for (int i = 0; i < spa->spa_spares.sav_count; i++)
	spa_async_remove(spa, spa->spa_spares.sav_vdevs[i]);
	(void) spa_vdev_state_exit(spa, NULL, 0);
	}

	/*
	 * Run the autoexpand task over the root vdev tree, unless the
	 * pool is suspended.
	 */
	if ((tasks & SPA_ASYNC_AUTOEXPAND) && !spa_suspended(spa)) {
	spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER);
	spa_async_autoexpand(spa, spa->spa_root_vdev);
	spa_config_exit(spa, SCL_CONFIG, FTAG);
	}

	/*
	* See if any devices need to be probed.
	*/
	if (tasks & SPA_ASYNC_PROBE) {
	spa_vdev_state_enter(spa, SCL_NONE);
	spa_async_probe(spa, spa->spa_root_vdev);
	(void) spa_vdev_state_exit(spa, NULL, 0);
	}

	/*
	* If any devices are done replacing, detach them.
	*/
	if (tasks & SPA_ASYNC_RESILVER_DONE)
	spa_vdev_resilver_done(spa);

	/*
	* Kick off a resilver.
	*/
	if (tasks & SPA_ASYNC_RESILVER)
	dsl_resilver_restart(spa->spa_dsl_pool, 0);

	/*
	* Let the world know that we're done.
	*/
	mutex_enter(&spa->spa_async_lock);
	spa->spa_async_thread = NULL;
	cv_broadcast(&spa->spa_async_cv);
	mutex_exit(&spa->spa_async_lock);
	thread_exit();
	}

	/*
	 * Suspend async task processing for this pool and wait for any async
	 * thread that is currently running to finish.  Each call must be
	 * balanced by a call to spa_async_resume().
	 */
	void
	spa_async_suspend(spa_t *spa)
	{
		mutex_enter(&spa->spa_async_lock);

		/* A non-zero count keeps spa_async_dispatch() from starting one. */
		spa->spa_async_suspended++;

		/* The async thread clears spa_async_thread and broadcasts on exit. */
		while (spa->spa_async_thread != NULL) {
			cv_wait(&spa->spa_async_cv, &spa->spa_async_lock);
		}

		mutex_exit(&spa->spa_async_lock);
	}

	/*
	 * Undo one spa_async_suspend() by decrementing the suspend count.
	 * Nothing is dispatched from here.
	 */
	void
	spa_async_resume(spa_t *spa)
	{
		mutex_enter(&spa->spa_async_lock);

		/* Calls must be paired with an earlier suspend. */
		ASSERT(spa->spa_async_suspended != 0);
		spa->spa_async_suspended--;

		mutex_exit(&spa->spa_async_lock);
	}

	/*
	 * Report whether any async task is ready to run.
	 *
	 * Any task other than SPA_ASYNC_CONFIG_UPDATE counts as pending.  A
	 * config update counts only if spa_ccw_fail_time is zero or at least
	 * zfs_ccw_retry_interval seconds have passed since that time.
	 */
	static boolean_t
	spa_async_tasks_pending(spa_t *spa)
	{
		uint_t non_config_tasks;
		uint_t config_task;
		boolean_t config_task_suspended;

		non_config_tasks = spa->spa_async_tasks & ~SPA_ASYNC_CONFIG_UPDATE;
		config_task = spa->spa_async_tasks & SPA_ASYNC_CONFIG_UPDATE;
		if (spa->spa_ccw_fail_time == 0) {
			config_task_suspended = B_FALSE;
		} else {
			/* Still inside the retry back-off window? */
			config_task_suspended =
			    (gethrtime() - spa->spa_ccw_fail_time) <
			    (zfs_ccw_retry_interval * NANOSEC);
		}

		return (non_config_tasks || (config_task && !config_task_suspended));
	}

	/*
	 * Start the pool's async thread if there is work for it.
	 *
	 * A thread is created only when a task is pending, async processing
	 * is not suspended, no async thread already exists, and rootdir is
	 * set.  The whole check-and-create runs under spa_async_lock.
	 */
	static void
	spa_async_dispatch(spa_t *spa)
	{
		mutex_enter(&spa->spa_async_lock);

		if (!spa_async_tasks_pending(spa) ||
		    spa->spa_async_suspended ||
		    spa->spa_async_thread != NULL ||
		    rootdir == NULL) {
			mutex_exit(&spa->spa_async_lock);
			return;
		}

		spa->spa_async_thread = thread_create(NULL, 0,
		    spa_async_thread, spa, 0, &p0, TS_RUN, maxclsyspri);

		mutex_exit(&spa->spa_async_lock);
	}

	/*
	 * Record a request for the async task(s) in 'task' (a mask of
	 * SPA_ASYNC_* bits).  This only sets the bits in spa_async_tasks; no
	 * thread is started from here.
	 */
	void
	spa_async_request(spa_t *spa, int task)
	{
		zfs_dbgmsg("spa=%s async request task=%u", spa->spa_name, task);
		mutex_enter(&spa->spa_async_lock);
		spa->spa_async_tasks |= task;
		mutex_exit(&spa->spa_async_lock);
	}

	/*
	* ==========================================================================
	* SPA syncing routines
	* ==========================================================================
	*/

	/*
	 * Iteration callback: append block pointer 'bp' to the bpobj passed
	 * as 'arg'.  Always returns 0.
	 */
	static int
	bpobj_enqueue_cb(void *arg, const blkptr_t *bp, dmu_tx_t *tx)
	{
		bpobj_t *bpo = arg;
		bpobj_enqueue(bpo, bp, tx);
		return (0);
	}

	/*
	 * Iteration callback: issue a free of block pointer 'bp' without
	 * waiting for it, as a child of the zio passed as 'arg', in the txg
	 * of 'tx'.  Always returns 0.
	 */
	static int
	spa_free_sync_cb(void *arg, const blkptr_t *bp, dmu_tx_t *tx)
	{
		zio_t *zio = arg;

		zio_nowait(zio_free_sync(zio, zio->io_spa, dmu_tx_get_txg(tx), bp,
		    zio->io_flags));
		return (0);
	}

	/*
	 * Free every block pointer on 'bpl' under one root zio and wait for
	 * the frees to complete; a failure trips VERIFY.
	 *
	 * Note: this simple function is not inlined to make it easier to dtrace the
	 * amount of time spent syncing frees.
	 */
	static void
	spa_sync_frees(spa_t *spa, bplist_t *bpl, dmu_tx_t *tx)
	{
		zio_t *zio = zio_root(spa, NULL, NULL, 0);
		bplist_iterate(bpl, spa_free_sync_cb, zio, tx);
		VERIFY(zio_wait(zio) == 0);
	}

	/*
	 * Free every block pointer in spa_deferred_bpobj under one root zio
	 * and wait for the frees to complete; any failure trips VERIFY.
	 *
	 * Note: this simple function is not inlined to make it easier to dtrace the
	 * amount of time spent syncing deferred frees.
	 */
	static void
	spa_sync_deferred_frees(spa_t *spa, dmu_tx_t *tx)
	{
		zio_t *zio = zio_root(spa, NULL, NULL, 0);
		VERIFY3U(bpobj_iterate(&spa->spa_deferred_bpobj,
		    spa_free_sync_cb, zio, tx), ==, 0);
		VERIFY0(zio_wait(zio));
	}


	/*
	 * Pack 'nv' with XDR encoding and write it at offset 0 of MOS object
	 * 'obj', zero-padded up to a multiple of SPA_CONFIG_BLOCKSIZE.  The
	 * packed size is then stored as a uint64_t in the object's bonus
	 * buffer.
	 */
	static void
	spa_sync_nvlist(spa_t *spa, uint64_t obj, nvlist_t *nv, dmu_tx_t *tx)
	{
		char *packed = NULL;
		size_t bufsize;
		size_t nvsize = 0;
		dmu_buf_t *db;

		VERIFY(nvlist_size(nv, &nvsize, NV_ENCODE_XDR) == 0);

		/*
		 * Write full (SPA_CONFIG_BLOCKSIZE) blocks of configuration
		 * information.  This avoids the dmu_buf_will_dirty() path and
		 * saves us a pre-read to get data we don't actually care about.
		 */
		bufsize = P2ROUNDUP((uint64_t)nvsize, SPA_CONFIG_BLOCKSIZE);
		packed = kmem_alloc(bufsize, KM_SLEEP);

		VERIFY(nvlist_pack(nv, &packed, &nvsize, NV_ENCODE_XDR,
		    KM_SLEEP) == 0);
		/* Zero the padding between the packed data and the buffer end. */
		bzero(packed + nvsize, bufsize - nvsize);

		dmu_write(spa->spa_meta_objset, obj, 0, bufsize, packed, tx);

		kmem_free(packed, bufsize);

		VERIFY(0 == dmu_bonus_hold(spa->spa_meta_objset, obj, FTAG, &db));
		dmu_buf_will_dirty(db, tx);
		*(uint64_t *)db->db_data = nvsize;
		dmu_buf_rele(db, FTAG);
	}

	/*
	 * Write the list of aux vdevs in 'sav' to its MOS object, if sav_sync
	 * is set.
	 *
	 * On first use the object is allocated and recorded in the pool
	 * directory under 'entry'.  The vdev configs are stored as an nvlist
	 * array under the key 'config'.  sav_sync is cleared afterwards.
	 */
	static void
	spa_sync_aux_dev(spa_t *spa, spa_aux_vdev_t *sav, dmu_tx_t *tx,
	    const char *config, const char *entry)
	{
		nvlist_t *nvroot;
		nvlist_t **list;
		int i;

		if (!sav->sav_sync)
			return;

		/*
		 * Update the MOS nvlist describing the list of available devices.
		 * spa_validate_aux() will have already made sure this nvlist is
		 * valid and the vdevs are labeled appropriately.
		 */
		if (sav->sav_object == 0) {
			sav->sav_object = dmu_object_alloc(spa->spa_meta_objset,
			    DMU_OT_PACKED_NVLIST, 1 << 14, DMU_OT_PACKED_NVLIST_SIZE,
			    sizeof (uint64_t), tx);
			VERIFY(zap_update(spa->spa_meta_objset,
			    DMU_POOL_DIRECTORY_OBJECT, entry, sizeof (uint64_t), 1,
			    &sav->sav_object, tx) == 0);
		}

		VERIFY(nvlist_alloc(&nvroot, NV_UNIQUE_NAME, KM_SLEEP) == 0);
		if (sav->sav_count == 0) {
			VERIFY(nvlist_add_nvlist_array(nvroot, config, NULL, 0) == 0);
		} else {
			list = kmem_alloc(sav->sav_count * sizeof (void *), KM_SLEEP);
			for (i = 0; i < sav->sav_count; i++)
				list[i] = vdev_config_generate(spa, sav->sav_vdevs[i],
				    B_FALSE, VDEV_CONFIG_L2CACHE);
			VERIFY(nvlist_add_nvlist_array(nvroot, config, list,
			    sav->sav_count) == 0);
			/* The per-vdev configs are freed once added to nvroot. */
			for (i = 0; i < sav->sav_count; i++)
				nvlist_free(list[i]);
			kmem_free(list, sav->sav_count * sizeof (void *));
		}

		spa_sync_nvlist(spa, sav->sav_object, nvroot, tx);
		nvlist_free(nvroot);

		sav->sav_sync = B_FALSE;
	}

	/*
	 * If any vdev config is dirty, generate a fresh pool config and write
	 * it to the MOS config object.  The generated nvlist is kept in
	 * spa_config_syncing, freeing any nvlist already stored there.
	 */
	static void
	spa_sync_config_object(spa_t *spa, dmu_tx_t *tx)
	{
		nvlist_t *config;

		if (list_is_empty(&spa->spa_config_dirty_list))
			return;

		spa_config_enter(spa, SCL_STATE, FTAG, RW_READER);

		config = spa_config_generate(spa, spa->spa_root_vdev,
		    dmu_tx_get_txg(tx), B_FALSE);

		/*
		 * If we're upgrading the spa version then make sure that
		 * the config object gets updated with the correct version.
		 */
		if (spa->spa_ubsync.ub_version < spa->spa_uberblock.ub_version)
			fnvlist_add_uint64(config, ZPOOL_CONFIG_VERSION,
			    spa->spa_uberblock.ub_version);

		spa_config_exit(spa, SCL_STATE, FTAG);

		if (spa->spa_config_syncing)
			nvlist_free(spa->spa_config_syncing);
		spa->spa_config_syncing = config;

		spa_sync_nvlist(spa, spa->spa_config_object, config, tx);
	}

	/*
	 * Sync-context callback that sets the pool version.
	 *
	 * 'arg' points to a uint64_t holding the new version.  The version is
	 * stored in the in-core uberblock, the root vdev's config is dirtied,
	 * and the change is logged to the pool history.
	 */
	static void
	spa_sync_version(void *arg, dmu_tx_t *tx)
	{
		uint64_t *versionp = arg;
		uint64_t version = *versionp;
		spa_t *spa = dmu_tx_pool(tx)->dp_spa;

		/*
		 * Setting the version is special cased when first creating the pool.
		 */
		ASSERT(tx->tx_txg != TXG_INITIAL);

		ASSERT(SPA_VERSION_IS_SUPPORTED(version));
		ASSERT(version >= spa_version(spa));

		spa->spa_uberblock.ub_version = version;
		vdev_config_dirty(spa->spa_root_vdev);
		spa_history_log_internal(spa, "set", tx, "version=%lld", version);
	}

	/*
	 * Set zpool properties.
	 *
	 * Sync-context callback: 'arg' is an nvlist of property name/value
	 * pairs.  Each pair is applied in turn under spa_props_lock.  Feature
	 * properties ("feature@...") enable the named feature; version,
	 * altroot, readonly and cachefile are not stored here; comment is kept
	 * in the spa and the vdev config is dirtied; every other property is
	 * written to the pool-props ZAP object and, for a few, mirrored into
	 * the in-core spa_t.
	 */
	static void
	spa_sync_props(void *arg, dmu_tx_t *tx)
	{
		nvlist_t *nvp = arg;
		spa_t *spa = dmu_tx_pool(tx)->dp_spa;
		objset_t *mos = spa->spa_meta_objset;
		nvpair_t *elem = NULL;

		mutex_enter(&spa->spa_props_lock);

		while ((elem = nvlist_next_nvpair(nvp, elem))) {
			uint64_t intval;
			char *strval, *fname;
			zpool_prop_t prop;
			const char *propname;
			zprop_type_t proptype;
			spa_feature_t fid;

			switch (prop = zpool_name_to_prop(nvpair_name(elem))) {
			case ZPROP_INVAL:
				/*
				 * We checked this earlier in spa_prop_validate().
				 */
				ASSERT(zpool_prop_feature(nvpair_name(elem)));

				/* The feature name is whatever follows the '@'. */
				fname = strchr(nvpair_name(elem), '@') + 1;
				VERIFY0(zfeature_lookup_name(fname, &fid));

				spa_feature_enable(spa, fid, tx);
				spa_history_log_internal(spa, "set", tx,
				    "%s=enabled", nvpair_name(elem));
				break;

			case ZPOOL_PROP_VERSION:
				intval = fnvpair_value_uint64(elem);
				/*
				 * The version is synced separately before other
				 * properties and should be correct by now.
				 */
				ASSERT3U(spa_version(spa), >=, intval);
				break;

			case ZPOOL_PROP_ALTROOT:
				/*
				 * 'altroot' is a non-persistent property. It should
				 * have been set temporarily at creation or import time.
				 */
				ASSERT(spa->spa_root != NULL);
				break;

			case ZPOOL_PROP_READONLY:
			case ZPOOL_PROP_CACHEFILE:
				/*
				 * 'readonly' and 'cachefile' are also non-persistent
				 * properties.
				 */
				break;
			case ZPOOL_PROP_COMMENT:
				strval = fnvpair_value_string(elem);
				if (spa->spa_comment != NULL)
					spa_strfree(spa->spa_comment);
				spa->spa_comment = spa_strdup(strval);
				/*
				 * We need to dirty the configuration on all the vdevs
				 * so that their labels get updated.  It's unnecessary
				 * to do this for pool creation since the vdev's
				 * configuration has already been dirtied.
				 */
				if (tx->tx_txg != TXG_INITIAL)
					vdev_config_dirty(spa->spa_root_vdev);
				spa_history_log_internal(spa, "set", tx,
				    "%s=%s", nvpair_name(elem), strval);
				break;
			default:
				/*
				 * Set pool property values in the poolprops mos object.
				 */
				if (spa->spa_pool_props_object == 0) {
					spa->spa_pool_props_object =
					    zap_create_link(mos, DMU_OT_POOL_PROPS,
					    DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_PROPS,
					    tx);
				}

				/* normalize the property name */
				propname = zpool_prop_to_name(prop);
				proptype = zpool_prop_get_type(prop);

				if (nvpair_type(elem) == DATA_TYPE_STRING) {
					ASSERT(proptype == PROP_TYPE_STRING);
					strval = fnvpair_value_string(elem);
					VERIFY0(zap_update(mos,
					    spa->spa_pool_props_object, propname,
					    1, strlen(strval) + 1, strval, tx));
					spa_history_log_internal(spa, "set", tx,
					    "%s=%s", nvpair_name(elem), strval);
				} else if (nvpair_type(elem) == DATA_TYPE_UINT64) {
					intval = fnvpair_value_uint64(elem);

					if (proptype == PROP_TYPE_INDEX) {
						/* Only checks that the index is valid. */
						const char *unused;
						VERIFY0(zpool_prop_index_to_string(
						    prop, intval, &unused));
					}
					VERIFY0(zap_update(mos,
					    spa->spa_pool_props_object, propname,
					    8, 1, &intval, tx));
					spa_history_log_internal(spa, "set", tx,
					    "%s=%lld", nvpair_name(elem), intval);
				} else {
					ASSERT(0); /* not allowed */
				}

				/* Mirror selected properties into the in-core spa_t. */
				switch (prop) {
				case ZPOOL_PROP_DELEGATION:
					spa->spa_delegation = intval;
					break;
				case ZPOOL_PROP_BOOTFS:
					spa->spa_bootfs = intval;
					break;
				case ZPOOL_PROP_FAILUREMODE:
					spa->spa_failmode = intval;
					break;
				case ZPOOL_PROP_AUTOEXPAND:
					spa->spa_autoexpand = intval;
					if (tx->tx_txg != TXG_INITIAL)
						spa_async_request(spa,
						    SPA_ASYNC_AUTOEXPAND);
					break;
				case ZPOOL_PROP_DEDUPDITTO:
					spa->spa_dedup_ditto = intval;
					break;
				default:
					break;
				}
			}

		}

		mutex_exit(&spa->spa_props_lock);
	}

	/*
	 * Perform one-time upgrade on-disk changes.  spa_version() does not
	 * reflect the new version this txg, so there must be no changes this
	 * txg to anything that the upgrade code depends on after it executes.
	 * Therefore this must be called after dsl_pool_sync() does the sync
	 * tasks.
	 *
	 * Each step runs only when the last-synced uberblock version
	 * (spa_ubsync) is below the relevant threshold and the in-core
	 * uberblock version is at or above it.  Runs only in sync pass 1,
	 * with dp_config_rwlock held as writer.
	 */
	static void
	spa_sync_upgrades(spa_t *spa, dmu_tx_t *tx)
	{
		dsl_pool_t *dp = spa->spa_dsl_pool;

		ASSERT(spa->spa_sync_pass == 1);

		rrw_enter(&dp->dp_config_rwlock, RW_WRITER, FTAG);

		if (spa->spa_ubsync.ub_version < SPA_VERSION_ORIGIN &&
		    spa->spa_uberblock.ub_version >= SPA_VERSION_ORIGIN) {
			dsl_pool_create_origin(dp, tx);

			/* Keeping the origin open increases spa_minref */
			spa->spa_minref += 3;
		}

		if (spa->spa_ubsync.ub_version < SPA_VERSION_NEXT_CLONES &&
		    spa->spa_uberblock.ub_version >= SPA_VERSION_NEXT_CLONES) {
			dsl_pool_upgrade_clones(dp, tx);
		}

		if (spa->spa_ubsync.ub_version < SPA_VERSION_DIR_CLONES &&
		    spa->spa_uberblock.ub_version >= SPA_VERSION_DIR_CLONES) {
			dsl_pool_upgrade_dir_clones(dp, tx);

			/* Keeping the freedir open increases spa_minref */
			spa->spa_minref += 3;
		}

		if (spa->spa_ubsync.ub_version < SPA_VERSION_FEATURES &&
		    spa->spa_uberblock.ub_version >= SPA_VERSION_FEATURES) {
			spa_feature_create_zap_objects(spa, tx);
		}

		/*
		 * LZ4_COMPRESS feature's behaviour was changed to activate_on_enable
		 * when possibility to use lz4 compression for metadata was added
		 * Old pools that have this feature enabled must be upgraded to have
		 * this feature active
		 */
		if (spa->spa_uberblock.ub_version >= SPA_VERSION_FEATURES) {
			boolean_t lz4_en = spa_feature_is_enabled(spa,
			    SPA_FEATURE_LZ4_COMPRESS);
			boolean_t lz4_ac = spa_feature_is_active(spa,
			    SPA_FEATURE_LZ4_COMPRESS);

			if (lz4_en && !lz4_ac)
				spa_feature_incr(spa, SPA_FEATURE_LZ4_COMPRESS, tx);
		}
		rrw_exit(&dp->dp_config_rwlock, FTAG);
	}

	/*
	 * Sync the specified transaction group.  New blocks may be dirtied as
	 * part of the process, so we iterate until it converges.
	 *
	 * Outline:
	 *   1. Turn pending vdev state changes into config changes.
	 *   2. Repeat sync passes until the MOS is no longer dirty for this txg.
	 *   3. Write the vdev labels/uberblock, retrying (after suspending and
	 *      resuming I/O) until that succeeds.
	 *   4. Publish the new config, run post-sync cleanup and dispatch any
	 *      requested async tasks.
	 *
	 * SCL_CONFIG is held as reader from the start until just before
	 * spa_handle_ignored_writes().
	 */
	void
	spa_sync(spa_t *spa, uint64_t txg)
	{
		dsl_pool_t *dp = spa->spa_dsl_pool;
		objset_t *mos = spa->spa_meta_objset;
		bplist_t *free_bpl = &spa->spa_free_bplist[txg & TXG_MASK];
		vdev_t *rvd = spa->spa_root_vdev;
		vdev_t *vd;
		dmu_tx_t *tx;
		int error;

		VERIFY(spa_writeable(spa));

		/*
		 * Lock out configuration changes.
		 */
		spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER);

		spa->spa_syncing_txg = txg;
		spa->spa_sync_pass = 0;

		/*
		 * If there are any pending vdev state changes, convert them
		 * into config changes that go out with this transaction group.
		 */
		spa_config_enter(spa, SCL_STATE, FTAG, RW_READER);
		while (list_head(&spa->spa_state_dirty_list) != NULL) {
			/*
			 * We need the write lock here because, for aux vdevs,
			 * calling vdev_config_dirty() modifies sav_config.
			 * This is ugly and will become unnecessary when we
			 * eliminate the aux vdev wart by integrating all vdevs
			 * into the root vdev tree.
			 */
			spa_config_exit(spa, SCL_CONFIG | SCL_STATE, FTAG);
			spa_config_enter(spa, SCL_CONFIG | SCL_STATE, FTAG, RW_WRITER);
			while ((vd = list_head(&spa->spa_state_dirty_list)) != NULL) {
				vdev_state_clean(vd);
				vdev_config_dirty(vd);
			}
			spa_config_exit(spa, SCL_CONFIG | SCL_STATE, FTAG);
			spa_config_enter(spa, SCL_CONFIG | SCL_STATE, FTAG, RW_READER);
		}
		spa_config_exit(spa, SCL_STATE, FTAG);

		tx = dmu_tx_create_assigned(dp, txg);

		/* Arm the deadman timer for this sync. */
		spa->spa_sync_starttime = gethrtime();
		VERIFY(cyclic_reprogram(spa->spa_deadman_cycid,
		    spa->spa_sync_starttime + spa->spa_deadman_synctime));

		/*
		 * If we are upgrading to SPA_VERSION_RAIDZ_DEFLATE this txg,
		 * set spa_deflate if we have no raid-z vdevs.
		 */
		if (spa->spa_ubsync.ub_version < SPA_VERSION_RAIDZ_DEFLATE &&
		    spa->spa_uberblock.ub_version >= SPA_VERSION_RAIDZ_DEFLATE) {
			int i;

			for (i = 0; i < rvd->vdev_children; i++) {
				vd = rvd->vdev_child[i];
				if (vd->vdev_deflate_ratio != SPA_MINBLOCKSIZE)
					break;
			}
			if (i == rvd->vdev_children) {
				spa->spa_deflate = TRUE;
				VERIFY(0 == zap_add(spa->spa_meta_objset,
				    DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_DEFLATE,
				    sizeof (uint64_t), 1, &spa->spa_deflate, tx));
			}
		}

		/*
		 * Iterate to convergence.
		 */
		do {
			int pass = ++spa->spa_sync_pass;

			spa_sync_config_object(spa, tx);
			spa_sync_aux_dev(spa, &spa->spa_spares, tx,
			    ZPOOL_CONFIG_SPARES, DMU_POOL_SPARES);
			spa_sync_aux_dev(spa, &spa->spa_l2cache, tx,
			    ZPOOL_CONFIG_L2CACHE, DMU_POOL_L2CACHE);
			spa_errlog_sync(spa, txg);
			dsl_pool_sync(dp, txg);

			if (pass < zfs_sync_pass_deferred_free) {
				spa_sync_frees(spa, free_bpl, tx);
			} else {
				/*
				 * We can not defer frees in pass 1, because
				 * we sync the deferred frees later in pass 1.
				 */
				ASSERT3U(pass, >, 1);
				bplist_iterate(free_bpl, bpobj_enqueue_cb,
				    &spa->spa_deferred_bpobj, tx);
			}

			ddt_sync(spa, txg);
			dsl_scan_sync(dp, tx);

			while ((vd = txg_list_remove(&spa->spa_vdev_txg_list,
			    txg)) != NULL)
				vdev_sync(vd, txg);

			if (pass == 1) {
				spa_sync_upgrades(spa, tx);
				ASSERT3U(txg, >=,
				    spa->spa_uberblock.ub_rootbp.blk_birth);
				/*
				 * Note: We need to check if the MOS is dirty
				 * because we could have marked the MOS dirty
				 * without updating the uberblock (e.g. if we
				 * have sync tasks but no dirty user data).  We
				 * need to check the uberblock's rootbp because
				 * it is updated if we have synced out dirty
				 * data (though in this case the MOS will most
				 * likely also be dirty due to second order
				 * effects, we don't want to rely on that here).
				 */
				if (spa->spa_uberblock.ub_rootbp.blk_birth < txg &&
				    !dmu_objset_is_dirty(mos, txg)) {
					/*
					 * Nothing changed on the first pass,
					 * therefore this TXG is a no-op.  Avoid
					 * syncing deferred frees, so that we
					 * can keep this TXG as a no-op.
					 */
					ASSERT(txg_list_empty(&dp->dp_dirty_datasets,
					    txg));
					ASSERT(txg_list_empty(&dp->dp_dirty_dirs, txg));
					ASSERT(txg_list_empty(&dp->dp_sync_tasks, txg));
					break;
				}
				spa_sync_deferred_frees(spa, tx);
			}

		} while (dmu_objset_is_dirty(mos, txg));

		/*
		 * Rewrite the vdev configuration (which includes the uberblock)
		 * to commit the transaction group.
		 *
		 * If there are no dirty vdevs, we sync the uberblock to a few
		 * random top-level vdevs that are known to be visible in the
		 * config cache (see spa_vdev_add() for a complete description).
		 * If there are dirty vdevs, sync the uberblock to all vdevs.
		 */
		for (;;) {
			/*
			 * We hold SCL_STATE to prevent vdev open/close/etc.
			 * while we're attempting to write the vdev labels.
			 */
			spa_config_enter(spa, SCL_STATE, FTAG, RW_READER);

			if (list_is_empty(&spa->spa_config_dirty_list)) {
				vdev_t *svd[SPA_DVAS_PER_BP];
				int svdcount = 0;
				int children = rvd->vdev_children;
				int c0 = spa_get_random(children);

				/* Start at a random child and wrap around. */
				for (int c = 0; c < children; c++) {
					vd = rvd->vdev_child[(c0 + c) % children];
					if (vd->vdev_ms_array == 0 || vd->vdev_islog)
						continue;
					svd[svdcount++] = vd;
					if (svdcount == SPA_DVAS_PER_BP)
						break;
				}
				/* On failure, retry once with the last argument set. */
				error = vdev_config_sync(svd, svdcount, txg, B_FALSE);
				if (error != 0)
					error = vdev_config_sync(svd, svdcount, txg,
					    B_TRUE);
			} else {
				error = vdev_config_sync(rvd->vdev_child,
				    rvd->vdev_children, txg, B_FALSE);
				if (error != 0)
					error = vdev_config_sync(rvd->vdev_child,
					    rvd->vdev_children, txg, B_TRUE);
			}

			if (error == 0)
				spa->spa_last_synced_guid = rvd->vdev_guid;

			spa_config_exit(spa, SCL_STATE, FTAG);

			if (error == 0)
				break;
			zio_suspend(spa, NULL);
			zio_resume_wait(spa);
		}
		dmu_tx_commit(tx);

		/* Disarm the deadman timer. */
		VERIFY(cyclic_reprogram(spa->spa_deadman_cycid, CY_INFINITY));

		/*
		 * Clear the dirty config list.
		 */
		while ((vd = list_head(&spa->spa_config_dirty_list)) != NULL)
			vdev_config_clean(vd);

		/*
		 * Now that the new config has synced transactionally,
		 * let it become visible to the config cache.
		 */
		if (spa->spa_config_syncing != NULL) {
			spa_config_set(spa, spa->spa_config_syncing);
			spa->spa_config_txg = txg;
			spa->spa_config_syncing = NULL;
		}

		spa->spa_ubsync = spa->spa_uberblock;

		dsl_pool_sync_done(dp, txg);

		/*
		 * Update usable space statistics.
		 */
		while ((vd = txg_list_remove(&spa->spa_vdev_txg_list,
		    TXG_CLEAN(txg))) != NULL)
			vdev_sync_done(vd, txg);

		spa_update_dspace(spa);

		/*
		 * It had better be the case that we didn't dirty anything
		 * since vdev_config_sync().
		 */
		ASSERT(txg_list_empty(&dp->dp_dirty_datasets, txg));
		ASSERT(txg_list_empty(&dp->dp_dirty_dirs, txg));
		ASSERT(txg_list_empty(&spa->spa_vdev_txg_list, txg));

		spa->spa_sync_pass = 0;

		spa_config_exit(spa, SCL_CONFIG, FTAG);

		spa_handle_ignored_writes(spa);

		/*
		 * If any async tasks have been requested, kick them off.
		 */
		spa_async_dispatch(spa);
	}

	/*
	 * Sync all pools.  We don't want to hold the namespace lock across these
	 * operations, so we take a reference on the spa_t and drop the lock during the
	 * sync.
	 *
	 * Pools that are not active, not writeable, or suspended are skipped.
	 */
	void
	spa_sync_allpools(void)
	{
		spa_t *spa = NULL;
		mutex_enter(&spa_namespace_lock);
		while ((spa = spa_next(spa)) != NULL) {
			if (spa_state(spa) != POOL_STATE_ACTIVE ||
			    !spa_writeable(spa) || spa_suspended(spa))
				continue;
			/* Hold a reference while the namespace lock is dropped. */
			spa_open_ref(spa, FTAG);
			mutex_exit(&spa_namespace_lock);
			txg_wait_synced(spa_get_dsl(spa), 0);
			mutex_enter(&spa_namespace_lock);
			spa_close(spa, FTAG);
		}
		mutex_exit(&spa_namespace_lock);
	}

	/*
	* ==========================================================================
	* Miscellaneous routines
	* ==========================================================================
	*/

	/*
	 * Remove all pools in the system.
	 *
	 * Each pass takes the first remaining spa, quiesces its async
	 * processing, unloads and deactivates it if it was ever initialized,
	 * and removes it from the namespace.
	 */
	void
	spa_evict_all(void)
	{
		spa_t *spa;

		/*
		 * Remove all cached state.  All pools should be closed now,
		 * so every spa in the AVL tree should be unreferenced.
		 */
		mutex_enter(&spa_namespace_lock);

		for (spa = spa_next(NULL); spa != NULL; spa = spa_next(NULL)) {
			/*
			 * Stop async tasks.  The async thread may need to detach
			 * a device that's been replaced, which requires grabbing
			 * spa_namespace_lock, so we must drop it here.
			 */
			spa_open_ref(spa, FTAG);
			mutex_exit(&spa_namespace_lock);
			spa_async_suspend(spa);
			mutex_enter(&spa_namespace_lock);
			spa_close(spa, FTAG);

			/* Only pools that got past initialization need tearing down. */
			if (spa->spa_state != POOL_STATE_UNINITIALIZED) {
				spa_unload(spa);
				spa_deactivate(spa);
			}

			spa_remove(spa);
		}

		mutex_exit(&spa_namespace_lock);
	}

	/*
	 * Find the vdev with the given guid.
	 *
	 * The root vdev tree is searched first.  If nothing is found there and
	 * 'aux' is true, the l2cache devices and then the spares are searched.
	 * Returns NULL when no vdev matches.
	 */
	vdev_t *
	spa_lookup_by_guid(spa_t *spa, uint64_t guid, boolean_t aux)
	{
		vdev_t *found = vdev_lookup_by_guid(spa->spa_root_vdev, guid);

		/* Done if the main tree had it, or aux devices are excluded. */
		if (found != NULL || !aux)
			return (found);

		for (int c = 0; c < spa->spa_l2cache.sav_count; c++) {
			vdev_t *cache = spa->spa_l2cache.sav_vdevs[c];

			if (cache->vdev_guid == guid)
				return (cache);
		}

		for (int s = 0; s < spa->spa_spares.sav_count; s++) {
			vdev_t *spare = spa->spa_spares.sav_vdevs[s];

			if (spare->vdev_guid == guid)
				return (spare);
		}

		return (NULL);
	}

	/*
	 * Raise the pool's on-disk version to 'version' and wait for the
	 * change to be synced.  The new version is stored in the in-core
	 * uberblock and the root vdev's config is dirtied, both with SCL_ALL
	 * held as writer.
	 */
	void
	spa_upgrade(spa_t *spa, uint64_t version)
	{
		ASSERT(spa_writeable(spa));

		spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);

		/*
		 * This should only be called for a non-faulted pool, and since a
		 * future version would result in an unopenable pool, this shouldn't be
		 * possible.
		 */
		ASSERT(SPA_VERSION_IS_SUPPORTED(spa->spa_uberblock.ub_version));
		ASSERT3U(version, >=, spa->spa_uberblock.ub_version);

		spa->spa_uberblock.ub_version = version;
		vdev_config_dirty(spa->spa_root_vdev);

		spa_config_exit(spa, SCL_ALL, FTAG);

		/* Block until the dirtied config has been written out. */
		txg_wait_synced(spa_get_dsl(spa), 0);
	}

	/*
	 * Return B_TRUE if 'guid' matches one of the pool's spare vdevs, or
	 * one of the entries in the spare list's pending array (matched by
	 * ZPOOL_CONFIG_GUID); B_FALSE otherwise.
	 */
	boolean_t
	spa_has_spare(spa_t *spa, uint64_t guid)
	{
		spa_aux_vdev_t *sav = &spa->spa_spares;

		/* Spares already loaded as vdevs. */
		for (int v = 0; v < sav->sav_count; v++) {
			if (sav->sav_vdevs[v]->vdev_guid == guid)
				return (B_TRUE);
		}

		/* Spares still described only by pending nvlists. */
		for (int p = 0; p < sav->sav_npending; p++) {
			uint64_t pending_guid;

			if (nvlist_lookup_uint64(sav->sav_pending[p],
			    ZPOOL_CONFIG_GUID, &pending_guid) != 0)
				continue;
			if (pending_guid == guid)
				return (B_TRUE);
		}

		return (B_FALSE);
	}

	/*
	 * Check if a pool has an active shared spare device.
	 * Note: reference count of an active spare is 2, as a spare and as a replace
	 *
	 * A spare counts when spa_spare_exists() reports it, the pool guid it
	 * returns is non-zero and equal to this pool's guid, and the returned
	 * reference count is greater than 2.
	 */
	static boolean_t
	spa_has_active_shared_spare(spa_t *spa)
	{
		spa_aux_vdev_t *spares = &spa->spa_spares;

		for (int idx = 0; idx < spares->sav_count; idx++) {
			uint64_t pool;
			int refcnt;

			if (!spa_spare_exists(spares->sav_vdevs[idx]->vdev_guid,
			    &pool, &refcnt))
				continue;
			if (pool == 0ULL || pool != spa_guid(spa))
				continue;
			if (refcnt > 2)
				return (B_TRUE);
		}

		return (B_FALSE);
	}

	/*
	 * Post a sysevent corresponding to the given event.  The 'name' must be one of
	 * the event definitions in sys/sysevent/eventdefs.h.  The payload will be
	 * filled in from the spa and (optionally) the vdev.  This doesn't do anything
	 * in the userland libzpool, as we don't want consumers to misinterpret ztest
	 * or zdb as real changes.
	 *
	 * The payload carries the pool name and guid, plus the vdev guid and
	 * (if set) vdev path when 'vd' is non-NULL.  If building the payload
	 * fails, no event is posted.
	 */
	void
	spa_event_notify(spa_t *spa, vdev_t *vd, const char *name)
	{
	#ifdef _KERNEL
		sysevent_t *ev;
		sysevent_attr_list_t *attr = NULL;
		sysevent_value_t value;
		sysevent_id_t eid;

		ev = sysevent_alloc(EC_ZFS, (char *)name, SUNW_KERN_PUB "zfs",
		    SE_SLEEP);

		value.value_type = SE_DATA_TYPE_STRING;
		value.value.sv_string = spa_name(spa);
		if (sysevent_add_attr(&attr, ZFS_EV_POOL_NAME, &value, SE_SLEEP) != 0)
			goto done;

		value.value_type = SE_DATA_TYPE_UINT64;
		value.value.sv_uint64 = spa_guid(spa);
		if (sysevent_add_attr(&attr, ZFS_EV_POOL_GUID, &value, SE_SLEEP) != 0)
			goto done;

		if (vd) {
			value.value_type = SE_DATA_TYPE_UINT64;
			value.value.sv_uint64 = vd->vdev_guid;
			if (sysevent_add_attr(&attr, ZFS_EV_VDEV_GUID, &value,
			    SE_SLEEP) != 0)
				goto done;

			if (vd->vdev_path) {
				value.value_type = SE_DATA_TYPE_STRING;
				value.value.sv_string = vd->vdev_path;
				if (sysevent_add_attr(&attr, ZFS_EV_VDEV_PATH,
				    &value, SE_SLEEP) != 0)
					goto done;
			}
		}

		if (sysevent_attach_attributes(ev, attr) != 0)
			goto done;
		/* Once attached, the list must not be freed separately below. */
		attr = NULL;

		(void) log_sysevent(ev, SE_SLEEP, &eid);

	done:
		if (attr)
			sysevent_free_attr(attr);
		sysevent_free(ev);
	#endif
	}
	Index: vendor-sys/illumos/dist/uts/common/fs/zfs/sys/dmu.h
	===================================================================
	--- vendor-sys/illumos/dist/uts/common/fs/zfs/sys/dmu.h (revision 284761)
	+++ vendor-sys/illumos/dist/uts/common/fs/zfs/sys/dmu.h (revision 284762)
	@@ -1,914 +1,915 @@
	/*
	* CDDL HEADER START
	*
	* The contents of this file are subject to the terms of the
	* Common Development and Distribution License (the "License").
	* You may not use this file except in compliance with the License.
	*
	* You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
	* or http://www.opensolaris.org/os/licensing.
	* See the License for the specific language governing permissions
	* and limitations under the License.
	*
	* When distributing Covered Code, include this CDDL HEADER in each
	* file and include the License file at usr/src/OPENSOLARIS.LICENSE.
	* If applicable, add the following below this CDDL HEADER, with the
	* fields enclosed by brackets "[]" replaced with your own identifying
	* information: Portions Copyright [yyyy] [name of copyright owner]
	*
	* CDDL HEADER END
	*/

	/*
	* Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
	* Copyright (c) 2011, 2014 by Delphix. All rights reserved.
	* Copyright 2011 Nexenta Systems, Inc. All rights reserved.
	* Copyright (c) 2012, Joyent, Inc. All rights reserved.
	* Copyright 2013 DEY Storage Systems, Inc.
	* Copyright 2014 HybridCluster. All rights reserved.
	* Copyright (c) 2014 Spectra Logic Corporation, All rights reserved.
	*/

	/* Portions Copyright 2010 Robert Milkowski */

	#ifndef _SYS_DMU_H
	#define _SYS_DMU_H

	/*
	* This file describes the interface that the DMU provides for its
	* consumers.
	*
	* The DMU also interacts with the SPA. That interface is described in
	* dmu_spa.h.
	*/

	#include <sys/zfs_context.h>
	#include <sys/inttypes.h>
	#include <sys/cred.h>
	#include <sys/fs/zfs.h>

	#ifdef __cplusplus
	extern "C" {
	#endif

	struct uio;
	struct xuio;
	struct page;
	struct vnode;
	struct spa;
	struct zilog;
	struct zio;
	struct blkptr;
	struct zap_cursor;
	struct dsl_dataset;
	struct dsl_pool;
	struct dnode;
	struct drr_begin;
	struct drr_end;
	struct zbookmark_phys;
	struct spa;
	struct nvlist;
	struct arc_buf;
	struct zio_prop;
	struct sa_handle;

	typedef struct objset objset_t;
	typedef struct dmu_tx dmu_tx_t;
	typedef struct dsl_dir dsl_dir_t;

	/*
	* Identifies how an object's data is byteswapped. The enumerators are
	* numbered consecutively from 0 and DMU_BSWAP_NUMFUNCS, being last, is the
	* number of byteswap types; it sizes the dmu_ot_byteswap[] table declared
	* later in this header.
	*/
	typedef enum dmu_object_byteswap {
	DMU_BSWAP_UINT8,
	DMU_BSWAP_UINT16,
	DMU_BSWAP_UINT32,
	DMU_BSWAP_UINT64,
	DMU_BSWAP_ZAP,
	DMU_BSWAP_DNODE,
	DMU_BSWAP_OBJSET,
	DMU_BSWAP_ZNODE,
	DMU_BSWAP_OLDACL,
	DMU_BSWAP_ACL,
	/*
	* Allocating a new byteswap type number makes the on-disk format
	* incompatible with any other format that uses the same number.
	*
	* Data can usually be structured to work with one of the
	* DMU_BSWAP_UINT* or DMU_BSWAP_ZAP types.
	*/
	DMU_BSWAP_NUMFUNCS
	} dmu_object_byteswap_t;

	/* Bit layout of a "new style" (DMU_OTN_*) object type value. */
	#define DMU_OT_NEWTYPE 0x80		/* set for every DMU_OT() value */
	#define DMU_OT_METADATA 0x40		/* object holds metadata */
	#define DMU_OT_BYTESWAP_MASK 0x3f	/* dmu_object_byteswap_t index */

	/*
	 * Defines a uint8_t object type. Object types specify if the data
	 * in the object is metadata (boolean) and how to byteswap the data
	 * (dmu_object_byteswap_t).
	 *
	 * byteswap: dmu_object_byteswap_t value; only the low 6 bits are kept.
	 * metadata: nonzero if the object holds metadata.
	 */
	#define DMU_OT(byteswap, metadata) \
		(DMU_OT_NEWTYPE | \
		((metadata) ? DMU_OT_METADATA : 0) | \
		((byteswap) & DMU_OT_BYTESWAP_MASK))

	/*
	* Nonzero if 'ot' is a usable object type: for DMU_OT_NEWTYPE values the
	* byteswap field must be below DMU_BSWAP_NUMFUNCS, otherwise 'ot' itself
	* must be below DMU_OT_NUMTYPES.
	*/
	#define DMU_OT_IS_VALID(ot) (((ot) & DMU_OT_NEWTYPE) ? \
	((ot) & DMU_OT_BYTESWAP_MASK) < DMU_BSWAP_NUMFUNCS : \
	(ot) < DMU_OT_NUMTYPES)

	/*
	* Nonzero if 'ot' denotes metadata: taken from the DMU_OT_METADATA bit for
	* DMU_OT_NEWTYPE values, otherwise looked up in the dmu_ot[] table.
	*/
	#define DMU_OT_IS_METADATA(ot) (((ot) & DMU_OT_NEWTYPE) ? \
	((ot) & DMU_OT_METADATA) : \
	dmu_ot[(ot)].ot_metadata)

	/*
	 * These object types use bp_fill != 1 for their L0 bp's. Therefore they
	 * can't have their data embedded (i.e. use a BP_IS_EMBEDDED() bp), because
	 * bp_fill is repurposed for embedded BPs.
	 */
	#define DMU_OT_HAS_FILL(ot) \
		((ot) == DMU_OT_DNODE || (ot) == DMU_OT_OBJSET)

	/*
	 * Byteswap type of 'ot': the low bits of the value itself for
	 * DMU_OT_NEWTYPE values, otherwise the dmu_ot[] table entry.
	 */
	#define DMU_OT_BYTESWAP(ot) (((ot) & DMU_OT_NEWTYPE) ? \
		((ot) & DMU_OT_BYTESWAP_MASK) : \
		dmu_ot[(ot)].ot_byteswap)

	typedef enum dmu_object_type {
	DMU_OT_NONE,
	/* general: */
	DMU_OT_OBJECT_DIRECTORY, /* ZAP */
	DMU_OT_OBJECT_ARRAY, /* UINT64 */
	DMU_OT_PACKED_NVLIST, /* UINT8 (XDR by nvlist_pack/unpack) */
	DMU_OT_PACKED_NVLIST_SIZE, /* UINT64 */
	DMU_OT_BPOBJ, /* UINT64 */
	DMU_OT_BPOBJ_HDR, /* UINT64 */
	/* spa: */
	DMU_OT_SPACE_MAP_HEADER, /* UINT64 */
	DMU_OT_SPACE_MAP, /* UINT64 */
	/* zil: */
	DMU_OT_INTENT_LOG, /* UINT64 */
	/* dmu: */
	DMU_OT_DNODE, /* DNODE */
	DMU_OT_OBJSET, /* OBJSET */
	/* dsl: */
	DMU_OT_DSL_DIR, /* UINT64 */
	DMU_OT_DSL_DIR_CHILD_MAP, /* ZAP */
	DMU_OT_DSL_DS_SNAP_MAP, /* ZAP */
	DMU_OT_DSL_PROPS, /* ZAP */
	DMU_OT_DSL_DATASET, /* UINT64 */
	/* zpl: */
	DMU_OT_ZNODE, /* ZNODE */
	DMU_OT_OLDACL, /* Old ACL */
	DMU_OT_PLAIN_FILE_CONTENTS, /* UINT8 */
	DMU_OT_DIRECTORY_CONTENTS, /* ZAP */
	DMU_OT_MASTER_NODE, /* ZAP */
	DMU_OT_UNLINKED_SET, /* ZAP */
	/* zvol: */
	DMU_OT_ZVOL, /* UINT8 */
	DMU_OT_ZVOL_PROP, /* ZAP */
	/* other; for testing only! */
	DMU_OT_PLAIN_OTHER, /* UINT8 */
	DMU_OT_UINT64_OTHER, /* UINT64 */
	DMU_OT_ZAP_OTHER, /* ZAP */
	/* new object types: */
	DMU_OT_ERROR_LOG, /* ZAP */
	DMU_OT_SPA_HISTORY, /* UINT8 */
	DMU_OT_SPA_HISTORY_OFFSETS, /* spa_his_phys_t */
	DMU_OT_POOL_PROPS, /* ZAP */
	DMU_OT_DSL_PERMS, /* ZAP */
	DMU_OT_ACL, /* ACL */
	DMU_OT_SYSACL, /* SYSACL */
	DMU_OT_FUID, /* FUID table (Packed NVLIST UINT8) */
	DMU_OT_FUID_SIZE, /* FUID table size UINT64 */
	DMU_OT_NEXT_CLONES, /* ZAP */
	DMU_OT_SCAN_QUEUE, /* ZAP */
	DMU_OT_USERGROUP_USED, /* ZAP */
	DMU_OT_USERGROUP_QUOTA, /* ZAP */
	DMU_OT_USERREFS, /* ZAP */
	DMU_OT_DDT_ZAP, /* ZAP */
	DMU_OT_DDT_STATS, /* ZAP */
	DMU_OT_SA, /* System attr */
	DMU_OT_SA_MASTER_NODE, /* ZAP */
	DMU_OT_SA_ATTR_REGISTRATION, /* ZAP */
	DMU_OT_SA_ATTR_LAYOUTS, /* ZAP */
	DMU_OT_SCAN_XLATE, /* ZAP */
	DMU_OT_DEDUP, /* fake dedup BP from ddt_bp_create() */
	DMU_OT_DEADLIST, /* ZAP */
	DMU_OT_DEADLIST_HDR, /* UINT64 */
	DMU_OT_DSL_CLONES, /* ZAP */
	DMU_OT_BPOBJ_SUBOBJ, /* UINT64 */
	/*
	* Do not allocate new object types here. Doing so makes the on-disk
	* format incompatible with any other format that uses the same object
	* type number.
	*
	* When creating an object which does not have one of the above types
	* use the DMU_OTN_* type with the correct byteswap and metadata
	* values.
	*
	* The DMU_OTN_* types do not have entries in the dmu_ot table,
	* use the DMU_OT_IS_METADATA() and DMU_OT_BYTESWAP() macros instead
	* of indexing into dmu_ot directly (this works for both DMU_OT_* types
	* and DMU_OTN_* types).
	*/
	DMU_OT_NUMTYPES,

	/*
	* Names for valid types declared with DMU_OT().
	*/
	DMU_OTN_UINT8_DATA = DMU_OT(DMU_BSWAP_UINT8, B_FALSE),
	DMU_OTN_UINT8_METADATA = DMU_OT(DMU_BSWAP_UINT8, B_TRUE),
	DMU_OTN_UINT16_DATA = DMU_OT(DMU_BSWAP_UINT16, B_FALSE),
	DMU_OTN_UINT16_METADATA = DMU_OT(DMU_BSWAP_UINT16, B_TRUE),
	DMU_OTN_UINT32_DATA = DMU_OT(DMU_BSWAP_UINT32, B_FALSE),
	DMU_OTN_UINT32_METADATA = DMU_OT(DMU_BSWAP_UINT32, B_TRUE),
	DMU_OTN_UINT64_DATA = DMU_OT(DMU_BSWAP_UINT64, B_FALSE),
	DMU_OTN_UINT64_METADATA = DMU_OT(DMU_BSWAP_UINT64, B_TRUE),
	DMU_OTN_ZAP_DATA = DMU_OT(DMU_BSWAP_ZAP, B_FALSE),
	DMU_OTN_ZAP_METADATA = DMU_OT(DMU_BSWAP_ZAP, B_TRUE),
	} dmu_object_type_t;

	typedef enum txg_how {
	TXG_WAIT = 1,
	TXG_NOWAIT,
	TXG_WAITED,
	} txg_how_t;

	void byteswap_uint64_array(void *buf, size_t size);
	void byteswap_uint32_array(void *buf, size_t size);
	void byteswap_uint16_array(void *buf, size_t size);
	void byteswap_uint8_array(void *buf, size_t size);
	void zap_byteswap(void *buf, size_t size);
	void zfs_oldacl_byteswap(void *buf, size_t size);
	void zfs_acl_byteswap(void *buf, size_t size);
	void zfs_znode_byteswap(void *buf, size_t size);

	#define DS_FIND_SNAPSHOTS (1<<0)
	#define DS_FIND_CHILDREN (1<<1)
	+#define DS_FIND_SERIALIZE (1<<2)

	/*
	* The maximum number of bytes that can be accessed as part of one
	* operation, including metadata.
	*/
	#define DMU_MAX_ACCESS (32 * 1024 * 1024) /* 32MB */
	#define DMU_MAX_DELETEBLKCNT (20480) /* ~5MB of indirect blocks */

	#define DMU_USERUSED_OBJECT (-1ULL)
	#define DMU_GROUPUSED_OBJECT (-2ULL)

	/*
	* artificial blkids for bonus buffer and spill blocks
	*/
	#define DMU_BONUS_BLKID (-1ULL)
	#define DMU_SPILL_BLKID (-2ULL)
	/*
	 * Public routines to create, destroy, open, and close objsets.
	 */
	int dmu_objset_hold(const char *name, void *tag, objset_t **osp);
	int dmu_objset_own(const char *name, dmu_objset_type_t type,
	    boolean_t readonly, void *tag, objset_t **osp);
	void dmu_objset_rele(objset_t *os, void *tag);
	void dmu_objset_disown(objset_t *os, void *tag);
	int dmu_objset_open_ds(struct dsl_dataset *ds, objset_t **osp);

	void dmu_objset_evict_dbufs(objset_t *os);
	int dmu_objset_create(const char *name, dmu_objset_type_t type, uint64_t flags,
	    void (*func)(objset_t *os, void *arg, cred_t *cr, dmu_tx_t *tx), void *arg);
	int dmu_objset_clone(const char *name, const char *origin);
	int dsl_destroy_snapshots_nvl(struct nvlist *snaps, boolean_t defer,
	    struct nvlist *errlist);
	int dmu_objset_snapshot_one(const char *fsname, const char *snapname);
	int dmu_objset_snapshot_tmp(const char *, const char *, int);
	int dmu_objset_find(char *name, int func(const char *, void *), void *arg,
	    int flags);
	void dmu_objset_byteswap(void *buf, size_t size);
	int dsl_dataset_rename_snapshot(const char *fsname,
	    const char *oldsnapname, const char *newsnapname, boolean_t recursive);

	/* Consumer-visible description of one held DMU buffer. */
	typedef struct dmu_buf {
		uint64_t db_object;	/* object that this buffer is part of */
		uint64_t db_offset;	/* byte offset in this object */
		uint64_t db_size;	/* size of buffer in bytes */
		void *db_data;		/* data in buffer */
	} dmu_buf_t;

	/*
	* The names of zap entries in the DIRECTORY_OBJECT of the MOS.
	*/
	#define DMU_POOL_DIRECTORY_OBJECT 1
	#define DMU_POOL_CONFIG "config"
	#define DMU_POOL_FEATURES_FOR_WRITE "features_for_write"
	#define DMU_POOL_FEATURES_FOR_READ "features_for_read"
	#define DMU_POOL_FEATURE_DESCRIPTIONS "feature_descriptions"
	#define DMU_POOL_FEATURE_ENABLED_TXG "feature_enabled_txg"
	#define DMU_POOL_ROOT_DATASET "root_dataset"
	#define DMU_POOL_SYNC_BPOBJ "sync_bplist"
	#define DMU_POOL_ERRLOG_SCRUB "errlog_scrub"
	#define DMU_POOL_ERRLOG_LAST "errlog_last"
	#define DMU_POOL_SPARES "spares"
	#define DMU_POOL_DEFLATE "deflate"
	#define DMU_POOL_HISTORY "history"
	#define DMU_POOL_PROPS "pool_props"
	#define DMU_POOL_L2CACHE "l2cache"
	#define DMU_POOL_TMP_USERREFS "tmp_userrefs"
	#define DMU_POOL_DDT "DDT-%s-%s-%s"
	#define DMU_POOL_DDT_STATS "DDT-statistics"
	#define DMU_POOL_CREATION_VERSION "creation_version"
	#define DMU_POOL_SCAN "scan"
	#define DMU_POOL_FREE_BPOBJ "free_bpobj"
	#define DMU_POOL_BPTREE_OBJ "bptree_obj"
	#define DMU_POOL_EMPTY_BPOBJ "empty_bpobj"

	/*
	* Allocate an object from this objset. The range of object numbers
	* available is (0, DN_MAX_OBJECT). Object 0 is the meta-dnode.
	*
	* The transaction must be assigned to a txg. The newly allocated
	* object will be "held" in the transaction (ie. you can modify the
	* newly allocated object in this transaction).
	*
	* dmu_object_alloc() chooses an object and returns it in *objectp.
	*
	* dmu_object_claim() allocates a specific object number. If that
	* number is already allocated, it fails and returns EEXIST.
	*
	* Return 0 on success, or ENOSPC or EEXIST as specified above.
	*/
	uint64_t dmu_object_alloc(objset_t *os, dmu_object_type_t ot,
	int blocksize, dmu_object_type_t bonus_type, int bonus_len, dmu_tx_t *tx);
	int dmu_object_claim(objset_t *os, uint64_t object, dmu_object_type_t ot,
	int blocksize, dmu_object_type_t bonus_type, int bonus_len, dmu_tx_t *tx);
	int dmu_object_reclaim(objset_t *os, uint64_t object, dmu_object_type_t ot,
	int blocksize, dmu_object_type_t bonustype, int bonuslen, dmu_tx_t *txp);

	/*
	 * Free an object from this objset.
	 *
	 * The object's data will be freed as well (ie. you don't need to call
	 * dmu_free(object, 0, -1, tx)).
	 *
	 * The object need not be held in the transaction.
	 *
	 * If there are any holds on this object's buffers (via dmu_buf_hold()),
	 * or tx holds on the object (via dmu_tx_hold_object()), you can not
	 * free it; it fails and returns EBUSY.
	 *
	 * If the object is not allocated, it fails and returns ENOENT.
	 *
	 * Return 0 on success, or EBUSY or ENOENT as specified above.
	 */
	int dmu_object_free(objset_t *os, uint64_t object, dmu_tx_t *tx);

	/*
	 * Find the next allocated or free object.
	 *
	 * The objectp parameter is in-out. It will be updated to be the next
	 * object which is allocated. Ignore objects which have not been
	 * modified since txg.
	 *
	 * XXX Can only be called on a objset with no dirty data.
	 *
	 * Returns 0 on success, or ENOENT if there are no more objects.
	 */
	int dmu_object_next(objset_t *os, uint64_t *objectp,
	    boolean_t hole, uint64_t txg);

	/*
	* Set the data blocksize for an object.
	*
	* The object cannot have any blocks allocated beyond the first. If
	* the first block is allocated already, the new size must be greater
	* than the current block size. If these conditions are not met,
	* ENOTSUP will be returned.
	*
	* Returns 0 on success, or EBUSY if there are any holds on the object
	* contents, or ENOTSUP as described above.
	*/
	int dmu_object_set_blocksize(objset_t *os, uint64_t object, uint64_t size,
	int ibs, dmu_tx_t *tx);

	/*
	* Set the checksum property on a dnode. The new checksum algorithm will
	* apply to all newly written blocks; existing blocks will not be affected.
	*/
	void dmu_object_set_checksum(objset_t *os, uint64_t object, uint8_t checksum,
	dmu_tx_t *tx);

	/*
	* Set the compress property on a dnode. The new compression algorithm will
	* apply to all newly written blocks; existing blocks will not be affected.
	*/
	void dmu_object_set_compress(objset_t *os, uint64_t object, uint8_t compress,
	dmu_tx_t *tx);

	void
	dmu_write_embedded(objset_t *os, uint64_t object, uint64_t offset,
	void *data, uint8_t etype, uint8_t comp, int uncompressed_size,
	int compressed_size, int byteorder, dmu_tx_t *tx);

	/*
	* Decide how to write a block: checksum, compression, number of copies, etc.
	*/
	#define WP_NOFILL 0x1
	#define WP_DMU_SYNC 0x2
	#define WP_SPILL 0x4

	/* 'wp' is a mask of the WP_* flags; the chosen policy is stored in *zp. */
	void dmu_write_policy(objset_t *os, struct dnode *dn, int level, int wp,
	    struct zio_prop *zp);
	/*
	* The bonus data is accessed more or less like a regular buffer.
	* You must dmu_bonus_hold() to get the buffer, which will give you a
	* dmu_buf_t with db_offset==-1ULL, and db_size = the size of the bonus
	* data. As with any normal buffer, you must call dmu_buf_read() to
	* read db_data, dmu_buf_will_dirty() before modifying it, and the
	* object must be held in an assigned transaction before calling
	* dmu_buf_will_dirty. You may use dmu_buf_set_user() on the bonus
	* buffer as well. You must release your hold with dmu_buf_rele().
	*
	* Returns ENOENT, EIO, or 0.
	*/
	int dmu_bonus_hold(objset_t os, uint64_t object, void tag, dmu_buf_t **);
	int dmu_bonus_max(void);
	int dmu_set_bonus(dmu_buf_t , int, dmu_tx_t );
	int dmu_set_bonustype(dmu_buf_t , dmu_object_type_t, dmu_tx_t );
	dmu_object_type_t dmu_get_bonustype(dmu_buf_t *);
	int dmu_rm_spill(objset_t , uint64_t, dmu_tx_t );

	/*
	 * Special spill buffer support used by "SA" framework
	 */

	int dmu_spill_hold_by_bonus(dmu_buf_t *bonus, void *tag, dmu_buf_t **dbp);
	int dmu_spill_hold_by_dnode(struct dnode *dn, uint32_t flags,
	    void *tag, dmu_buf_t **dbp);
	int dmu_spill_hold_existing(dmu_buf_t *bonus, void *tag, dmu_buf_t **dbp);

	/*
	 * Obtain the DMU buffer from the specified object which contains the
	 * specified offset. dmu_buf_hold() puts a "hold" on the buffer, so
	 * that it will remain in memory. You must release the hold with
	 * dmu_buf_rele(). You mustn't access the dmu_buf_t after releasing your
	 * hold. You must have a hold on any dmu_buf_t* you pass to the DMU.
	 *
	 * You must call dmu_buf_read, dmu_buf_will_dirty, or dmu_buf_will_fill
	 * on the returned buffer before reading or writing the buffer's
	 * db_data. The comments for those routines describe what particular
	 * operations are valid after calling them.
	 *
	 * The object number must be a valid, allocated object number.
	 */
	int dmu_buf_hold(objset_t *os, uint64_t object, uint64_t offset,
	    void *tag, dmu_buf_t **, int flags);
	void dmu_buf_add_ref(dmu_buf_t *db, void *tag);
	void dmu_buf_rele(dmu_buf_t *db, void *tag);
	uint64_t dmu_buf_refcount(dmu_buf_t *db);

	/*
	 * dmu_buf_hold_array holds the DMU buffers which contain all bytes in a
	 * range of an object. A pointer to an array of dmu_buf_t*'s is
	 * returned (in *dbpp).
	 *
	 * dmu_buf_rele_array releases the hold on an array of dmu_buf_t*'s, and
	 * frees the array. The hold on the array of buffers MUST be released
	 * with dmu_buf_rele_array. You can NOT release the hold on each buffer
	 * individually with dmu_buf_rele.
	 */
	int dmu_buf_hold_array_by_bonus(dmu_buf_t *db, uint64_t offset,
	    uint64_t length, int read, void *tag, int *numbufsp, dmu_buf_t ***dbpp);
	void dmu_buf_rele_array(dmu_buf_t **, int numbufs, void *tag);

	typedef void dmu_buf_evict_func_t(void *user_ptr);

	/*
	* A DMU buffer user object may be associated with a dbuf for the
	* duration of its lifetime. This allows the user of a dbuf (client)
	* to attach private data to a dbuf (e.g. in-core only data such as a
	* dnode_children_t, zap_t, or zap_leaf_t) and be optionally notified
	* when that dbuf has been evicted. Clients typically respond to the
	* eviction notification by freeing their private data, thus ensuring
	* the same lifetime for both dbuf and private data.
	*
	* The mapping from a dmu_buf_user_t to any client private data is the
	* client's responsibility. All current consumers of the API with private
	* data embed a dmu_buf_user_t as the first member of the structure for
	* their private data. This allows conversions between the two types
	* with a simple cast. Since the DMU buf user API never needs access
	* to the private data, other strategies can be employed if necessary
	* or convenient for the client (e.g. using container_of() to do the
	* conversion for private data that cannot have the dmu_buf_user_t as
	* its first member).
	*
	* Eviction callbacks are executed without the dbuf mutex held or any
	* other type of mechanism to guarantee that the dbuf is still available.
	* For this reason, users must assume the dbuf has already been freed
	* and not reference the dbuf from the callback context.
	*
	* Users requesting "immediate eviction" are notified as soon as the dbuf
	* is only referenced by dirty records (dirties == holds). Otherwise the
	* notification occurs after eviction processing for the dbuf begins.
	*/
	typedef struct dmu_buf_user {
	/*
	* Asynchronous user eviction callback state.
	*/
	taskq_ent_t dbu_tqent;

	/* This instance's eviction function pointer. */
	dmu_buf_evict_func_t *dbu_evict_func;
	#ifdef ZFS_DEBUG
	/*
	* Pointer to user's dbuf pointer. NULL for clients that do
	* not associate a dbuf with their user data.
	*
	* The dbuf pointer is cleared upon eviction so as to catch
	* use-after-evict bugs in clients.
	*/
	dmu_buf_t **dbu_clear_on_evict_dbufp;
	#endif
	} dmu_buf_user_t;

	/*
	 * Initialize the given dmu_buf_user_t instance with the eviction function
	 * evict_func, to be called when the user is evicted.
	 *
	 * dbu:                  user record to initialize.
	 * evict_func:           eviction callback; must not be NULL.
	 * clear_on_evict_dbufp: recorded only when ZFS_DEBUG is defined; see
	 *                       dbu_clear_on_evict_dbufp in dmu_buf_user_t.
	 *
	 * NOTE: This function should only be called once on a given dmu_buf_user_t.
	 * To allow enforcement of this, dbu must already be zeroed on entry.
	 */
	#ifdef __lint
	/* Very ugly, but it beats issuing suppression directives in many Makefiles. */
	extern void
	dmu_buf_init_user(dmu_buf_user_t *dbu, dmu_buf_evict_func_t *evict_func,
	    dmu_buf_t **clear_on_evict_dbufp);
	#else /* __lint */
	inline void
	dmu_buf_init_user(dmu_buf_user_t *dbu, dmu_buf_evict_func_t *evict_func,
	    dmu_buf_t **clear_on_evict_dbufp)
	{
		/* A callback that is already set means dbu was initialized twice. */
		ASSERT(dbu->dbu_evict_func == NULL);
		ASSERT(evict_func != NULL);
		dbu->dbu_evict_func = evict_func;
	#ifdef ZFS_DEBUG
		dbu->dbu_clear_on_evict_dbufp = clear_on_evict_dbufp;
	#endif
	}
	#endif /* __lint */

	/*
	 * Attach user data to a dbuf and mark it for normal (when the dbuf's
	 * data is cleared or its reference count goes to zero) eviction processing.
	 *
	 * Returns NULL on success, or the existing user if another user currently
	 * owns the buffer.
	 */
	void *dmu_buf_set_user(dmu_buf_t *db, dmu_buf_user_t *user);

	/*
	 * Attach user data to a dbuf and mark it for immediate (its dirty and
	 * reference counts are equal) eviction processing.
	 *
	 * Returns NULL on success, or the existing user if another user currently
	 * owns the buffer.
	 */
	void *dmu_buf_set_user_ie(dmu_buf_t *db, dmu_buf_user_t *user);

	/*
	 * Replace the current user of a dbuf.
	 *
	 * If given the current user of a dbuf, replaces the dbuf's user with
	 * "new_user" and returns the user data pointer that was replaced.
	 * Otherwise returns the current, and unmodified, dbuf user pointer.
	 */
	void *dmu_buf_replace_user(dmu_buf_t *db,
	    dmu_buf_user_t *old_user, dmu_buf_user_t *new_user);

	/*
	 * Remove the specified user data for a DMU buffer.
	 *
	 * Returns the user that was removed on success, or the current user if
	 * another user currently owns the buffer.
	 */
	void *dmu_buf_remove_user(dmu_buf_t *db, dmu_buf_user_t *user);

	/*
	 * Returns the user data (dmu_buf_user_t *) associated with this dbuf.
	 */
	void *dmu_buf_get_user(dmu_buf_t *db);

	/* Block until any in-progress dmu buf user evictions complete. */
	void dmu_buf_user_evict_wait(void);

	/*
	 * Returns the blkptr associated with this dbuf, or NULL if not set.
	 */
	struct blkptr *dmu_buf_get_blkptr(dmu_buf_t *db);

	/*
	 * Indicate that you are going to modify the buffer's data (db_data).
	 *
	 * The transaction (tx) must be assigned to a txg (ie. you've called
	 * dmu_tx_assign()). The buffer's object must be held in the tx
	 * (ie. you've called dmu_tx_hold_object(tx, db->db_object)).
	 */
	void dmu_buf_will_dirty(dmu_buf_t *db, dmu_tx_t *tx);

	/*
	* Tells if the given dbuf is freeable.
	*/
	boolean_t dmu_buf_freeable(dmu_buf_t *);

	/*
	* You must create a transaction, then hold the objects which you will
	* (or might) modify as part of this transaction. Then you must assign
	* the transaction to a transaction group. Once the transaction has
	* been assigned, you can modify buffers which belong to held objects as
	* part of this transaction. You can't modify buffers before the
	* transaction has been assigned; you can't modify buffers which don't
	* belong to objects which this transaction holds; you can't hold
	* objects once the transaction has been assigned. You may hold an
	* object which you are going to free (with dmu_object_free()), but you
	* don't have to.
	*
	* You can abort the transaction before it has been assigned.
	*
	* Note that you may hold buffers (with dmu_buf_hold) at any time,
	* regardless of transaction state.
	*/

	#define DMU_NEW_OBJECT (-1ULL)
	#define DMU_OBJECT_END (-1ULL)

	/* Transaction creation, holds, assignment and completion. */
	dmu_tx_t *dmu_tx_create(objset_t *os);
	void dmu_tx_hold_write(dmu_tx_t *tx, uint64_t object, uint64_t off, int len);
	void dmu_tx_hold_free(dmu_tx_t *tx, uint64_t object, uint64_t off,
	    uint64_t len);
	void dmu_tx_hold_zap(dmu_tx_t *tx, uint64_t object, int add, const char *name);
	void dmu_tx_hold_bonus(dmu_tx_t *tx, uint64_t object);
	void dmu_tx_hold_spill(dmu_tx_t *tx, uint64_t object);
	void dmu_tx_hold_sa(dmu_tx_t *tx, struct sa_handle *hdl, boolean_t may_grow);
	void dmu_tx_hold_sa_create(dmu_tx_t *tx, int total_size);
	void dmu_tx_abort(dmu_tx_t *tx);
	int dmu_tx_assign(dmu_tx_t *tx, enum txg_how txg_how);
	void dmu_tx_wait(dmu_tx_t *tx);
	void dmu_tx_commit(dmu_tx_t *tx);
	void dmu_tx_mark_netfree(dmu_tx_t *tx);

	/*
	* To register a commit callback, dmu_tx_callback_register() must be called.
	*
	* dcb_data is a pointer to caller private data that is passed on as a
	* callback parameter. The caller is responsible for properly allocating and
	* freeing it.
	*
	* When registering a callback, the transaction must be already created, but
	* it cannot be committed or aborted. It can be assigned to a txg or not.
	*
	* The callback will be called after the transaction has been safely written
	* to stable storage and will also be called if the dmu_tx is aborted.
	* If there is any error which prevents the transaction from being committed to
	* disk, the callback will be called with a value of error != 0.
	*/
	/* Commit callback: receives the caller's dcb_data and an error code. */
	typedef void dmu_tx_callback_func_t(void *dcb_data, int error);

	void dmu_tx_callback_register(dmu_tx_t *tx, dmu_tx_callback_func_t *dcb_func,
	    void *dcb_data);

	/*
	* Free up the data blocks for a defined range of a file. If size is
	* -1, the range from offset to end-of-file is freed.
	*/
	int dmu_free_range(objset_t *os, uint64_t object, uint64_t offset,
	uint64_t size, dmu_tx_t *tx);
	int dmu_free_long_range(objset_t *os, uint64_t object, uint64_t offset,
	uint64_t size);
	int dmu_free_long_object(objset_t *os, uint64_t object);

	/*
	 * Convenience functions.
	 *
	 * Canfail routines will return 0 on success, or an errno if there is a
	 * nonrecoverable I/O error.
	 */
	#define DMU_READ_PREFETCH 0 /* prefetch */
	#define DMU_READ_NO_PREFETCH 1 /* don't prefetch */
	int dmu_read(objset_t *os, uint64_t object, uint64_t offset, uint64_t size,
	    void *buf, uint32_t flags);
	void dmu_write(objset_t *os, uint64_t object, uint64_t offset, uint64_t size,
	    const void *buf, dmu_tx_t *tx);
	void dmu_prealloc(objset_t *os, uint64_t object, uint64_t offset, uint64_t size,
	    dmu_tx_t *tx);
	int dmu_read_uio(objset_t *os, uint64_t object, struct uio *uio, uint64_t size);
	int dmu_read_uio_dbuf(dmu_buf_t *zdb, struct uio *uio, uint64_t size);
	int dmu_write_uio(objset_t *os, uint64_t object, struct uio *uio, uint64_t size,
	    dmu_tx_t *tx);
	int dmu_write_uio_dbuf(dmu_buf_t *zdb, struct uio *uio, uint64_t size,
	    dmu_tx_t *tx);
	int dmu_write_pages(objset_t *os, uint64_t object, uint64_t offset,
	    uint64_t size, struct page *pp, dmu_tx_t *tx);
	struct arc_buf *dmu_request_arcbuf(dmu_buf_t *handle, int size);
	void dmu_return_arcbuf(struct arc_buf *buf);
	void dmu_assign_arcbuf(dmu_buf_t *handle, uint64_t offset, struct arc_buf *buf,
	    dmu_tx_t *tx);
	int dmu_xuio_init(struct xuio *uio, int niov);
	void dmu_xuio_fini(struct xuio *uio);
	int dmu_xuio_add(struct xuio *uio, struct arc_buf *abuf, offset_t off,
	    size_t n);
	int dmu_xuio_cnt(struct xuio *uio);
	struct arc_buf *dmu_xuio_arcbuf(struct xuio *uio, int i);
	void dmu_xuio_clear(struct xuio *uio, int i);
	void xuio_stat_wbuf_copied();
	void xuio_stat_wbuf_nocopy();

	extern int zfs_prefetch_disable;
	extern int zfs_max_recordsize;

	/*
	* Asynchronously try to read in the data.
	*/
	void dmu_prefetch(objset_t *os, uint64_t object, uint64_t offset,
	uint64_t len);

	typedef struct dmu_object_info {
	/* All sizes are in bytes unless otherwise indicated. */
	uint32_t doi_data_block_size;
	uint32_t doi_metadata_block_size;
	dmu_object_type_t doi_type;
	dmu_object_type_t doi_bonus_type;
	uint64_t doi_bonus_size;
	uint8_t doi_indirection; /* 2 = dnode->indirect->data */
	uint8_t doi_checksum;
	uint8_t doi_compress;
	uint8_t doi_nblkptr;
	uint8_t doi_pad[4];
	uint64_t doi_physical_blocks_512; /* data + metadata, 512b blks */
	uint64_t doi_max_offset;
	uint64_t doi_fill_count; /* number of non-empty blocks */
	} dmu_object_info_t;

	typedef void arc_byteswap_func_t(void *buf, size_t size);

	typedef struct dmu_object_type_info {
	dmu_object_byteswap_t ot_byteswap;
	boolean_t ot_metadata;
	char *ot_name;
	} dmu_object_type_info_t;

	typedef struct dmu_object_byteswap_info {
	arc_byteswap_func_t *ob_func;
	char *ob_name;
	} dmu_object_byteswap_info_t;

	extern const dmu_object_type_info_t dmu_ot[DMU_OT_NUMTYPES];
	extern const dmu_object_byteswap_info_t dmu_ot_byteswap[DMU_BSWAP_NUMFUNCS];

	/*
	 * Get information on a DMU object.
	 *
	 * Return 0 on success or ENOENT if object is not allocated.
	 *
	 * If doi is NULL, just indicates whether the object exists.
	 */
	int dmu_object_info(objset_t *os, uint64_t object, dmu_object_info_t *doi);
	/* Like dmu_object_info, but faster if you have a held dnode in hand. */
	void dmu_object_info_from_dnode(struct dnode *dn, dmu_object_info_t *doi);
	/* Like dmu_object_info, but faster if you have a held dbuf in hand. */
	void dmu_object_info_from_db(dmu_buf_t *db, dmu_object_info_t *doi);
	/*
	 * Like dmu_object_info_from_db, but faster still when you only care about
	 * the size. This is specifically optimized for zfs_getattr().
	 */
	void dmu_object_size_from_db(dmu_buf_t *db, uint32_t *blksize,
	    u_longlong_t *nblk512);

	typedef struct dmu_objset_stats {
	uint64_t dds_num_clones; /* number of clones of this */
	uint64_t dds_creation_txg;
	uint64_t dds_guid;
	dmu_objset_type_t dds_type;
	uint8_t dds_is_snapshot;
	uint8_t dds_inconsistent;
	char dds_origin[MAXNAMELEN];
	} dmu_objset_stats_t;

	/*
	 * Get stats on a dataset.
	 */
	void dmu_objset_fast_stat(objset_t *os, dmu_objset_stats_t *stat);

	/*
	 * Add entries to the nvlist for all the objset's properties. See
	 * zfs_prop_table[] and zfs(1m) for details on the properties.
	 */
	void dmu_objset_stats(objset_t *os, struct nvlist *nv);

	/*
	 * Get the space usage statistics for statvfs().
	 *
	 * refdbytes is the amount of space "referenced" by this objset.
	 * availbytes is the amount of space available to this objset, taking
	 * into account quotas & reservations, assuming that no other objsets
	 * use the space first. These values correspond to the 'referenced' and
	 * 'available' properties, described in the zfs(1m) manpage.
	 *
	 * usedobjs and availobjs are the number of objects currently allocated,
	 * and available.
	 */
	void dmu_objset_space(objset_t *os, uint64_t *refdbytesp, uint64_t *availbytesp,
	    uint64_t *usedobjsp, uint64_t *availobjsp);

	/*
	* The fsid_guid is a 56-bit ID that can change to avoid collisions.
	* (Contrast with the ds_guid which is a 64-bit ID that will never
	* change, so there is a small probability that it will collide.)
	*/
	uint64_t dmu_objset_fsid_guid(objset_t *os);

	/*
	* Get the [cm]time for an objset's snapshot dir
	*/
	timestruc_t dmu_objset_snap_cmtime(objset_t *os);

	int dmu_objset_is_snapshot(objset_t *os);

	/* Accessors for an objset's owning structures and properties. */
	extern struct spa *dmu_objset_spa(objset_t *os);
	extern struct zilog *dmu_objset_zil(objset_t *os);
	extern struct dsl_pool *dmu_objset_pool(objset_t *os);
	extern struct dsl_dataset *dmu_objset_ds(objset_t *os);
	extern void dmu_objset_name(objset_t *os, char *buf);
	extern dmu_objset_type_t dmu_objset_type(objset_t *os);
	extern uint64_t dmu_objset_id(objset_t *os);
	extern zfs_sync_type_t dmu_objset_syncprop(objset_t *os);
	extern zfs_logbias_op_t dmu_objset_logbias(objset_t *os);
	extern int dmu_snapshot_list_next(objset_t *os, int namelen, char *name,
	    uint64_t *id, uint64_t *offp, boolean_t *case_conflict);
	extern int dmu_snapshot_realname(objset_t *os, char *name, char *real,
	    int maxlen, boolean_t *conflict);
	extern int dmu_dir_list_next(objset_t *os, int namelen, char *name,
	    uint64_t *idp, uint64_t *offp);

	typedef int objset_used_cb_t(dmu_object_type_t bonustype,
	    void *bonus, uint64_t *userp, uint64_t *groupp);
	extern void dmu_objset_register_type(dmu_objset_type_t ost,
	    objset_used_cb_t *cb);
	extern void dmu_objset_set_user(objset_t *os, void *user_ptr);
	extern void *dmu_objset_get_user(objset_t *os);

	/*
	* Return the txg number for the given assigned transaction.
	*/
	uint64_t dmu_tx_get_txg(dmu_tx_t *tx);

	/*
	* Synchronous write.
	* If a parent zio is provided this function initiates a write on the
	* provided buffer as a child of the parent zio.
	* In the absence of a parent zio, the write is completed synchronously.
	* At write completion, blk is filled with the bp of the written block.
	* Note that while the data covered by this function will be on stable
	* storage when the write completes this new data does not become a
	* permanent part of the file until the associated transaction commits.
	*/

	/*
	* {zfs,zvol,ztest}_get_done() args
	*/
	typedef struct zgd {
	struct zilog *zgd_zilog;
	struct blkptr *zgd_bp;
	dmu_buf_t *zgd_db;
	struct rl *zgd_rl;
	void *zgd_private;
	} zgd_t;

	/* Callback type passed to dmu_sync() as its `done' argument. */
	typedef void dmu_sync_cb_t(zgd_t *arg, int error);
	int dmu_sync(struct zio *zio, uint64_t txg, dmu_sync_cb_t *done, zgd_t *zgd);

	/*
	* Find the next hole or data block in file starting at *off
	* Return found offset in *off. Return ESRCH for end of file.
	*/
	int dmu_offset_next(objset_t *os, uint64_t object, boolean_t hole,
	uint64_t *off);

	/*
	* Initial setup and final teardown.
	*/
	extern void dmu_init(void);
	extern void dmu_fini(void);

	/* Callback type taken by dmu_traverse_objset(). */
	typedef void (*dmu_traverse_cb_t)(objset_t *os, void *arg, struct blkptr *bp,
	uint64_t object, uint64_t offset, int len);
	void dmu_traverse_objset(objset_t *os, uint64_t txg_start,
	dmu_traverse_cb_t cb, void *arg);

	int dmu_diff(const char *tosnap_name, const char *fromsnap_name,
	struct vnode *vp, offset_t *offp);

	/* CRC64 table */
	#define ZFS_CRC64_POLY 0xC96C5795D7870F42ULL /* ECMA-182, reflected form */
	extern uint64_t zfs_crc64_table[256];

	extern int zfs_mdcomp_disable;

	#ifdef __cplusplus
	}
	#endif

	#endif /* _SYS_DMU_H */
	Index: vendor-sys/illumos/dist/uts/common/fs/zfs/sys/dmu_objset.h
	===================================================================
	--- vendor-sys/illumos/dist/uts/common/fs/zfs/sys/dmu_objset.h (revision 284761)
	+++ vendor-sys/illumos/dist/uts/common/fs/zfs/sys/dmu_objset.h (revision 284762)
	@@ -1,186 +1,188 @@
	/*
	* CDDL HEADER START
	*
	* The contents of this file are subject to the terms of the
	* Common Development and Distribution License (the "License").
	* You may not use this file except in compliance with the License.
	*
	* You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
	* or http://www.opensolaris.org/os/licensing.
	* See the License for the specific language governing permissions
	* and limitations under the License.
	*
	* When distributing Covered Code, include this CDDL HEADER in each
	* file and include the License file at usr/src/OPENSOLARIS.LICENSE.
	* If applicable, add the following below this CDDL HEADER, with the
	* fields enclosed by brackets "[]" replaced with your own identifying
	* information: Portions Copyright [yyyy] [name of copyright owner]
	*
	* CDDL HEADER END
	*/
	/*
	* Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
	* Copyright (c) 2012, 2014 by Delphix. All rights reserved.
	* Copyright (c) 2013 by Saso Kiselkov. All rights reserved.
	* Copyright (c) 2014 Spectra Logic Corporation, All rights reserved.
	*/

	/* Portions Copyright 2010 Robert Milkowski */

	#ifndef _SYS_DMU_OBJSET_H
	#define _SYS_DMU_OBJSET_H

	#include <sys/spa.h>
	#include <sys/arc.h>
	#include <sys/txg.h>
	#include <sys/zfs_context.h>
	#include <sys/dnode.h>
	#include <sys/zio.h>
	#include <sys/zil.h>
	#include <sys/sa.h>

	#ifdef __cplusplus
	extern "C" {
	#endif

	extern krwlock_t os_lock;

	struct dsl_pool;
	struct dsl_dataset;
	struct dmu_tx;

	#define OBJSET_PHYS_SIZE 2048
	#define OBJSET_OLD_PHYS_SIZE 1024

	#define OBJSET_BUF_HAS_USERUSED(buf) \
	(arc_buf_size(buf) > OBJSET_OLD_PHYS_SIZE)

	#define OBJSET_FLAG_USERACCOUNTING_COMPLETE (1ULL<<0)

	/*
	* Physical objset structure: the meta dnode, the ZIL header, a type and
	* flags word, padding, and the userused/groupused dnodes at the tail.
	* NOTE(review): OBJSET_BUF_HAS_USERUSED() tests for a buffer larger than
	* OBJSET_OLD_PHYS_SIZE, which suggests older, shorter buffers do not
	* carry the two trailing dnodes -- confirm against the users of that macro.
	*/
	typedef struct objset_phys {
	dnode_phys_t os_meta_dnode;
	zil_header_t os_zil_header;
	uint64_t os_type;
	uint64_t os_flags;
	/*
	* Pad so that three dnode_phys_t, one zil_header_t, two uint64_t and
	* this array add up to OBJSET_PHYS_SIZE bytes (assuming the compiler
	* inserts no padding of its own).
	*/
	char os_pad[OBJSET_PHYS_SIZE - sizeof (dnode_phys_t)*3 -
	sizeof (zil_header_t) - sizeof (uint64_t)*2];
	dnode_phys_t os_userused_dnode;
	dnode_phys_t os_groupused_dnode;
	} objset_phys_t;

	struct objset {
	/* Immutable: */
	struct dsl_dataset *os_dsl_dataset;
	spa_t *os_spa;
	arc_buf_t *os_phys_buf;
	objset_phys_t *os_phys;
	/*
	* The following "special" dnodes have no parent, are exempt
	* from dnode_move(), and are not recorded in os_dnodes, but they
	* root their descendents in this objset using handles anyway, so
	* that all access to dnodes from dbufs consistently uses handles.
	*/
	dnode_handle_t os_meta_dnode;
	dnode_handle_t os_userused_dnode;
	dnode_handle_t os_groupused_dnode;
	zilog_t *os_zil;

	list_node_t os_evicting_node;

	/* can change, under dsl_dir's locks: */
	enum zio_checksum os_checksum;
	enum zio_compress os_compress;
	uint8_t os_copies;
	enum zio_checksum os_dedup_checksum;
	boolean_t os_dedup_verify;
	boolean_t os_evicting;
	zfs_logbias_op_t os_logbias;
	zfs_cache_type_t os_primary_cache;
	zfs_cache_type_t os_secondary_cache;
	zfs_sync_type_t os_sync;
	zfs_redundant_metadata_type_t os_redundant_metadata;
	int os_recordsize;

	/* no lock needed: */
	struct dmu_tx os_synctx; / XXX sketchy */
	blkptr_t *os_rootbp;
	zil_header_t os_zil_header;
	list_t os_synced_dnodes;
	uint64_t os_flags;

	/* Protected by os_obj_lock */
	kmutex_t os_obj_lock;
	uint64_t os_obj_next;

	/* Protected by os_lock */
	kmutex_t os_lock;
	list_t os_dirty_dnodes[TXG_SIZE];
	list_t os_free_dnodes[TXG_SIZE];
	list_t os_dnodes;
	list_t os_downgraded_dbufs;

	/* stuff we store for the user */
	kmutex_t os_user_ptr_lock;
	void *os_user_ptr;
	sa_os_t *os_sa;
	};

	#define DMU_META_OBJSET 0
	#define DMU_META_DNODE_OBJECT 0
	#define DMU_OBJECT_IS_SPECIAL(obj) ((int64_t)(obj) <= 0)
	#define DMU_META_DNODE(os) ((os)->os_meta_dnode.dnh_dnode)
	#define DMU_USERUSED_DNODE(os) ((os)->os_userused_dnode.dnh_dnode)
	#define DMU_GROUPUSED_DNODE(os) ((os)->os_groupused_dnode.dnh_dnode)

	/*
	* True when the objset's os_secondary_cache setting is ZFS_CACHE_ALL or
	* ZFS_CACHE_METADATA.
	*/
	#define DMU_OS_IS_L2CACHEABLE(os) \
	((os)->os_secondary_cache == ZFS_CACHE_ALL || \
	(os)->os_secondary_cache == ZFS_CACHE_METADATA)

	#define DMU_OS_IS_L2COMPRESSIBLE(os) (zfs_mdcomp_disable == B_FALSE)

	/* called from zpl */
	int dmu_objset_hold(const char *name, void *tag, objset_t **osp);
	int dmu_objset_own(const char *name, dmu_objset_type_t type,
	boolean_t readonly, void *tag, objset_t **osp);
	+int dmu_objset_own_obj(struct dsl_pool *dp, uint64_t obj,
	+ dmu_objset_type_t type, boolean_t readonly, void *tag, objset_t **osp);
	void dmu_objset_refresh_ownership(objset_t *os, void *tag);
	void dmu_objset_rele(objset_t *os, void *tag);
	void dmu_objset_disown(objset_t *os, void *tag);
	int dmu_objset_from_ds(struct dsl_dataset *ds, objset_t **osp);

	void dmu_objset_stats(objset_t *os, nvlist_t *nv);
	void dmu_objset_fast_stat(objset_t *os, dmu_objset_stats_t *stat);
	void dmu_objset_space(objset_t *os, uint64_t *refdbytesp, uint64_t *availbytesp,
	uint64_t *usedobjsp, uint64_t *availobjsp);
	uint64_t dmu_objset_fsid_guid(objset_t *os);
	int dmu_objset_find_dp(struct dsl_pool *dp, uint64_t ddobj,
	int func(struct dsl_pool *, struct dsl_dataset *, void *),
	void *arg, int flags);
	int dmu_objset_prefetch(const char *name, void *arg);
	void dmu_objset_evict_dbufs(objset_t *os);
	timestruc_t dmu_objset_snap_cmtime(objset_t *os);

	/* called from dsl */
	void dmu_objset_sync(objset_t *os, zio_t *zio, dmu_tx_t *tx);
	boolean_t dmu_objset_is_dirty(objset_t *os, uint64_t txg);
	objset_t *dmu_objset_create_impl(spa_t *spa, struct dsl_dataset *ds,
	blkptr_t *bp, dmu_objset_type_t type, dmu_tx_t *tx);
	int dmu_objset_open_impl(spa_t *spa, struct dsl_dataset *ds, blkptr_t *bp,
	objset_t **osp);
	void dmu_objset_evict(objset_t *os);
	void dmu_objset_do_userquota_updates(objset_t *os, dmu_tx_t *tx);
	void dmu_objset_userquota_get_ids(dnode_t *dn, boolean_t before, dmu_tx_t *tx);
	boolean_t dmu_objset_userused_enabled(objset_t *os);
	int dmu_objset_userspace_upgrade(objset_t *os);
	boolean_t dmu_objset_userspace_present(objset_t *os);
	int dmu_fsname(const char *snapname, char *buf);

	void dmu_objset_evict_done(objset_t *os);

	void dmu_objset_init(void);
	void dmu_objset_fini(void);

	#ifdef __cplusplus
	}
	#endif

	#endif /* _SYS_DMU_OBJSET_H */
	Index: vendor-sys/illumos/dist/uts/common/fs/zfs/sys/dsl_pool.h
	===================================================================
	--- vendor-sys/illumos/dist/uts/common/fs/zfs/sys/dsl_pool.h (revision 284761)
	+++ vendor-sys/illumos/dist/uts/common/fs/zfs/sys/dsl_pool.h (revision 284762)
	@@ -1,174 +1,175 @@
	/*
	* CDDL HEADER START
	*
	* The contents of this file are subject to the terms of the
	* Common Development and Distribution License (the "License").
	* You may not use this file except in compliance with the License.
	*
	* You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
	* or http://www.opensolaris.org/os/licensing.
	* See the License for the specific language governing permissions
	* and limitations under the License.
	*
	* When distributing Covered Code, include this CDDL HEADER in each
	* file and include the License file at usr/src/OPENSOLARIS.LICENSE.
	* If applicable, add the following below this CDDL HEADER, with the
	* fields enclosed by brackets "[]" replaced with your own identifying
	* information: Portions Copyright [yyyy] [name of copyright owner]
	*
	* CDDL HEADER END
	*/
	/*
	* Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
	* Copyright (c) 2013 by Delphix. All rights reserved.
	*/

	#ifndef _SYS_DSL_POOL_H
	#define _SYS_DSL_POOL_H

	#include <sys/spa.h>
	#include <sys/txg.h>
	#include <sys/txg_impl.h>
	#include <sys/zfs_context.h>
	#include <sys/zio.h>
	#include <sys/dnode.h>
	#include <sys/ddt.h>
	#include <sys/arc.h>
	#include <sys/bpobj.h>
	#include <sys/bptree.h>
	#include <sys/rrwlock.h>

	#ifdef __cplusplus
	extern "C" {
	#endif

	struct objset;
	struct dsl_dir;
	struct dsl_dataset;
	struct dsl_pool;
	struct dmu_tx;
	struct dsl_scan;

	extern uint64_t zfs_dirty_data_max;
	extern uint64_t zfs_dirty_data_max_max;
	extern uint64_t zfs_dirty_data_sync;
	extern int zfs_dirty_data_max_percent;
	extern int zfs_delay_min_dirty_percent;
	extern uint64_t zfs_delay_scale;

	/* These macros are for indexing into the zfs_all_blkstats_t. */
	#define DMU_OT_DEFERRED DMU_OT_NONE
	#define DMU_OT_OTHER DMU_OT_NUMTYPES /* place holder for DMU_OT() types */
	#define DMU_OT_TOTAL (DMU_OT_NUMTYPES + 1)

	typedef struct zfs_blkstat {
	uint64_t zb_count;
	uint64_t zb_asize;
	uint64_t zb_lsize;
	uint64_t zb_psize;
	uint64_t zb_gangs;
	uint64_t zb_ditto_2_of_2_samevdev;
	uint64_t zb_ditto_2_of_3_samevdev;
	uint64_t zb_ditto_3_of_3_samevdev;
	} zfs_blkstat_t;

	/*
	* Two-dimensional table of zfs_blkstat_t entries. The second index can be
	* addressed with the DMU_OT_DEFERRED, DMU_OT_OTHER and DMU_OT_TOTAL macros
	* defined above; its extent of DMU_OT_TOTAL + 1 makes DMU_OT_TOTAL a valid
	* slot.
	* NOTE(review): the first index (DN_MAX_LEVELS + 1 rows) looks like a
	* block level with one extra row -- confirm against the code that fills
	* this table in.
	*/
	typedef struct zfs_all_blkstats {
	zfs_blkstat_t zab_type[DN_MAX_LEVELS + 1][DMU_OT_TOTAL + 1];
	} zfs_all_blkstats_t;


	typedef struct dsl_pool {
	/* Immutable */
	spa_t *dp_spa;
	struct objset *dp_meta_objset;
	struct dsl_dir *dp_root_dir;
	struct dsl_dir *dp_mos_dir;
	struct dsl_dir *dp_free_dir;
	struct dsl_dir *dp_leak_dir;
	struct dsl_dataset *dp_origin_snap;
	uint64_t dp_root_dir_obj;
	struct taskq *dp_vnrele_taskq;

	/* No lock needed - sync context only */
	blkptr_t dp_meta_rootbp;
	uint64_t dp_tmp_userrefs_obj;
	bpobj_t dp_free_bpobj;
	uint64_t dp_bptree_obj;
	uint64_t dp_empty_bpobj;

	struct dsl_scan *dp_scan;

	/* Uses dp_lock */
	kmutex_t dp_lock;
	kcondvar_t dp_spaceavail_cv;
	uint64_t dp_dirty_pertxg[TXG_SIZE];
	uint64_t dp_dirty_total;
	uint64_t dp_mos_used_delta;
	uint64_t dp_mos_compressed_delta;
	uint64_t dp_mos_uncompressed_delta;

	/*
	* Time of most recently scheduled (furthest in the future)
	* wakeup for delayed transactions.
	*/
	hrtime_t dp_last_wakeup;

	/* Has its own locking */
	tx_state_t dp_tx;
	txg_list_t dp_dirty_datasets;
	txg_list_t dp_dirty_zilogs;
	txg_list_t dp_dirty_dirs;
	txg_list_t dp_sync_tasks;

	/*
	* Protects administrative changes (properties, namespace)
	*
	* It is only held for write in syncing context. Therefore
	* syncing context does not need to ever have it for read, since
	* nobody else could possibly have it for write.
	*/
	rrwlock_t dp_config_rwlock;

	zfs_all_blkstats_t *dp_blkstats;
	} dsl_pool_t;

	int dsl_pool_init(spa_t *spa, uint64_t txg, dsl_pool_t **dpp);
	int dsl_pool_open(dsl_pool_t *dp);
	void dsl_pool_close(dsl_pool_t *dp);
	dsl_pool_t *dsl_pool_create(spa_t *spa, nvlist_t *zplprops, uint64_t txg);
	void dsl_pool_sync(dsl_pool_t *dp, uint64_t txg);
	void dsl_pool_sync_done(dsl_pool_t *dp, uint64_t txg);
	int dsl_pool_sync_context(dsl_pool_t *dp);
	uint64_t dsl_pool_adjustedsize(dsl_pool_t *dp, boolean_t netfree);
	uint64_t dsl_pool_adjustedfree(dsl_pool_t *dp, boolean_t netfree);
	void dsl_pool_dirty_space(dsl_pool_t *dp, int64_t space, dmu_tx_t *tx);
	void dsl_pool_undirty_space(dsl_pool_t *dp, int64_t space, uint64_t txg);
	void dsl_free(dsl_pool_t *dp, uint64_t txg, const blkptr_t *bpp);
	void dsl_free_sync(zio_t *pio, dsl_pool_t *dp, uint64_t txg,
	const blkptr_t *bpp);
	void dsl_pool_create_origin(dsl_pool_t *dp, dmu_tx_t *tx);
	void dsl_pool_upgrade_clones(dsl_pool_t *dp, dmu_tx_t *tx);
	void dsl_pool_upgrade_dir_clones(dsl_pool_t *dp, dmu_tx_t *tx);
	void dsl_pool_mos_diduse_space(dsl_pool_t *dp,
	int64_t used, int64_t comp, int64_t uncomp);
	void dsl_pool_config_enter(dsl_pool_t *dp, void *tag);
	void dsl_pool_config_exit(dsl_pool_t *dp, void *tag);
	boolean_t dsl_pool_config_held(dsl_pool_t *dp);
	+boolean_t dsl_pool_config_held_writer(dsl_pool_t *dp);
	boolean_t dsl_pool_need_dirty_delay(dsl_pool_t *dp);

	taskq_t *dsl_pool_vnrele_taskq(dsl_pool_t *dp);

	int dsl_pool_user_hold(dsl_pool_t *dp, uint64_t dsobj,
	const char *tag, uint64_t now, dmu_tx_t *tx);
	int dsl_pool_user_release(dsl_pool_t *dp, uint64_t dsobj,
	const char *tag, dmu_tx_t *tx);
	void dsl_pool_clean_tmp_userrefs(dsl_pool_t *dp);
	int dsl_pool_open_special_dir(dsl_pool_t *dp, const char *name, dsl_dir_t **);
	int dsl_pool_hold(const char *name, void *tag, dsl_pool_t **dp);
	void dsl_pool_rele(dsl_pool_t *dp, void *tag);

	#ifdef __cplusplus
	}
	#endif

	#endif /* _SYS_DSL_POOL_H */
	Index: vendor-sys/illumos/dist/uts/common/fs/zfs/sys/vdev.h
	===================================================================
	--- vendor-sys/illumos/dist/uts/common/fs/zfs/sys/vdev.h (revision 284761)
	+++ vendor-sys/illumos/dist/uts/common/fs/zfs/sys/vdev.h (revision 284762)
	@@ -1,164 +1,165 @@
	/*
	* CDDL HEADER START
	*
	* The contents of this file are subject to the terms of the
	* Common Development and Distribution License (the "License").
	* You may not use this file except in compliance with the License.
	*
	* You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
	* or http://www.opensolaris.org/os/licensing.
	* See the License for the specific language governing permissions
	* and limitations under the License.
	*
	* When distributing Covered Code, include this CDDL HEADER in each
	* file and include the License file at usr/src/OPENSOLARIS.LICENSE.
	* If applicable, add the following below this CDDL HEADER, with the
	* fields enclosed by brackets "[]" replaced with your own identifying
	* information: Portions Copyright [yyyy] [name of copyright owner]
	*
	* CDDL HEADER END
	*/

	/*
	* Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
	* Copyright (c) 2013 by Delphix. All rights reserved.
	*/

	#ifndef _SYS_VDEV_H
	#define _SYS_VDEV_H

	#include <sys/spa.h>
	#include <sys/zio.h>
	#include <sys/dmu.h>
	#include <sys/space_map.h>
	#include <sys/fs/zfs.h>

	#ifdef __cplusplus
	extern "C" {
	#endif

	typedef enum vdev_dtl_type {
	DTL_MISSING, /* 0% replication: no copies of the data */
	DTL_PARTIAL, /* less than 100% replication: some copies missing */
	DTL_SCRUB, /* unable to fully repair during scrub/resilver */
	DTL_OUTAGE, /* temporarily missing (used to attempt detach) */
	DTL_TYPES
	} vdev_dtl_type_t;

	extern boolean_t zfs_nocacheflush;

	extern int vdev_open(vdev_t *);
	extern void vdev_open_children(vdev_t *);
	extern boolean_t vdev_uses_zvols(vdev_t *);
	extern int vdev_validate(vdev_t *, boolean_t);
	extern void vdev_close(vdev_t *);
	extern int vdev_create(vdev_t *, uint64_t txg, boolean_t isreplace);
	extern void vdev_reopen(vdev_t *);
	extern int vdev_validate_aux(vdev_t *vd);
	extern zio_t *vdev_probe(vdev_t *vd, zio_t *pio);

	extern boolean_t vdev_is_bootable(vdev_t *vd);
	extern vdev_t *vdev_lookup_top(spa_t *spa, uint64_t vdev);
	extern vdev_t *vdev_lookup_by_guid(vdev_t *vd, uint64_t guid);
	+extern int vdev_count_leaves(spa_t *spa);
	extern void vdev_dtl_dirty(vdev_t *vd, vdev_dtl_type_t d,
	uint64_t txg, uint64_t size);
	extern boolean_t vdev_dtl_contains(vdev_t *vd, vdev_dtl_type_t d,
	uint64_t txg, uint64_t size);
	extern boolean_t vdev_dtl_empty(vdev_t *vd, vdev_dtl_type_t d);
	extern void vdev_dtl_reassess(vdev_t *vd, uint64_t txg, uint64_t scrub_txg,
	int scrub_done);
	extern boolean_t vdev_dtl_required(vdev_t *vd);
	extern boolean_t vdev_resilver_needed(vdev_t *vd,
	uint64_t *minp, uint64_t *maxp);

	extern void vdev_hold(vdev_t *);
	extern void vdev_rele(vdev_t *);

	extern int vdev_metaslab_init(vdev_t *vd, uint64_t txg);
	extern void vdev_metaslab_fini(vdev_t *vd);
	extern void vdev_metaslab_set_size(vdev_t *);
	extern void vdev_expand(vdev_t *vd, uint64_t txg);
	extern void vdev_split(vdev_t *vd);
	extern void vdev_deadman(vdev_t *vd);


	extern void vdev_get_stats(vdev_t *vd, vdev_stat_t *vs);
	extern void vdev_clear_stats(vdev_t *vd);
	extern void vdev_stat_update(zio_t *zio, uint64_t psize);
	extern void vdev_scan_stat_init(vdev_t *vd);
	extern void vdev_propagate_state(vdev_t *vd);
	extern void vdev_set_state(vdev_t *vd, boolean_t isopen, vdev_state_t state,
	vdev_aux_t aux);

	extern void vdev_space_update(vdev_t *vd,
	int64_t alloc_delta, int64_t defer_delta, int64_t space_delta);

	extern uint64_t vdev_psize_to_asize(vdev_t *vd, uint64_t psize);

	extern int vdev_fault(spa_t *spa, uint64_t guid, vdev_aux_t aux);
	extern int vdev_degrade(spa_t *spa, uint64_t guid, vdev_aux_t aux);
	extern int vdev_online(spa_t *spa, uint64_t guid, uint64_t flags,
	vdev_state_t *);
	extern int vdev_offline(spa_t *spa, uint64_t guid, uint64_t flags);
	extern void vdev_clear(spa_t *spa, vdev_t *vd);

	extern boolean_t vdev_is_dead(vdev_t *vd);
	extern boolean_t vdev_readable(vdev_t *vd);
	extern boolean_t vdev_writeable(vdev_t *vd);
	extern boolean_t vdev_allocatable(vdev_t *vd);
	extern boolean_t vdev_accessible(vdev_t *vd, zio_t *zio);

	extern void vdev_cache_init(vdev_t *vd);
	extern void vdev_cache_fini(vdev_t *vd);
	extern boolean_t vdev_cache_read(zio_t *zio);
	extern void vdev_cache_write(zio_t *zio);
	extern void vdev_cache_purge(vdev_t *vd);

	extern void vdev_queue_init(vdev_t *vd);
	extern void vdev_queue_fini(vdev_t *vd);
	extern zio_t *vdev_queue_io(zio_t *zio);
	extern void vdev_queue_io_done(zio_t *zio);

	extern void vdev_config_dirty(vdev_t *vd);
	extern void vdev_config_clean(vdev_t *vd);
	extern int vdev_config_sync(vdev_t **svd, int svdcount, uint64_t txg,
	boolean_t);

	extern void vdev_state_dirty(vdev_t *vd);
	extern void vdev_state_clean(vdev_t *vd);

	typedef enum vdev_config_flag {
	VDEV_CONFIG_SPARE = 1 << 0,
	VDEV_CONFIG_L2CACHE = 1 << 1,
	VDEV_CONFIG_REMOVING = 1 << 2
	} vdev_config_flag_t;

	extern void vdev_top_config_generate(spa_t *spa, nvlist_t *config);
	extern nvlist_t *vdev_config_generate(spa_t *spa, vdev_t *vd,
	boolean_t getstats, vdev_config_flag_t flags);

	/*
	* Label routines
	*/
	struct uberblock;
	extern uint64_t vdev_label_offset(uint64_t psize, int l, uint64_t offset);
	extern int vdev_label_number(uint64_t psise, uint64_t offset);
	extern nvlist_t *vdev_label_read_config(vdev_t *vd, uint64_t txg);
	extern void vdev_uberblock_load(vdev_t *, struct uberblock *, nvlist_t **);

	typedef enum {
	VDEV_LABEL_CREATE, /* create/add a new device */
	VDEV_LABEL_REPLACE, /* replace an existing device */
	VDEV_LABEL_SPARE, /* add a new hot spare */
	VDEV_LABEL_REMOVE, /* remove an existing device */
	VDEV_LABEL_L2CACHE, /* add an L2ARC cache device */
	VDEV_LABEL_SPLIT /* generating new label for split-off dev */
	} vdev_labeltype_t;

	extern int vdev_label_init(vdev_t *vd, uint64_t txg, vdev_labeltype_t reason);

	#ifdef __cplusplus
	}
	#endif

	#endif /* _SYS_VDEV_H */
	Index: vendor-sys/illumos/dist/uts/common/fs/zfs/sys/zil.h
	===================================================================
	--- vendor-sys/illumos/dist/uts/common/fs/zfs/sys/zil.h (revision 284761)
	+++ vendor-sys/illumos/dist/uts/common/fs/zfs/sys/zil.h (revision 284762)
	@@ -1,428 +1,433 @@
	/*
	* CDDL HEADER START
	*
	* The contents of this file are subject to the terms of the
	* Common Development and Distribution License (the "License").
	* You may not use this file except in compliance with the License.
	*
	* You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
	* or http://www.opensolaris.org/os/licensing.
	* See the License for the specific language governing permissions
	* and limitations under the License.
	*
	* When distributing Covered Code, include this CDDL HEADER in each
	* file and include the License file at usr/src/OPENSOLARIS.LICENSE.
	* If applicable, add the following below this CDDL HEADER, with the
	* fields enclosed by brackets "[]" replaced with your own identifying
	* information: Portions Copyright [yyyy] [name of copyright owner]
	*
	* CDDL HEADER END
	*/
	/*
	* Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
	* Copyright (c) 2012 by Delphix. All rights reserved.
	*/

	/* Portions Copyright 2010 Robert Milkowski */

	#ifndef _SYS_ZIL_H
	#define _SYS_ZIL_H

	#include <sys/types.h>
	#include <sys/spa.h>
	#include <sys/zio.h>
	#include <sys/dmu.h>

	#ifdef __cplusplus
	extern "C" {
	#endif

	+struct dsl_pool;
	+struct dsl_dataset;
	+
	/*
	* Intent log format:
	*
	* Each objset has its own intent log. The log header (zil_header_t)
	* for objset N's intent log is kept in the Nth object of the SPA's
	* intent_log objset. The log header points to a chain of log blocks,
	* each of which contains log records (i.e., transactions) followed by
	* a log block trailer (zil_trailer_t). The format of a log record
	* depends on the record (or transaction) type, but all records begin
	* with a common structure that defines the type, length, and txg.
	*/

	/*
	* Intent log header - this on disk structure holds fields to manage
	* the log. All fields are 64 bit to easily handle cross architectures.
	*/
	typedef struct zil_header {
	uint64_t zh_claim_txg; /* txg in which log blocks were claimed */
	uint64_t zh_replay_seq; /* highest replayed sequence number */
	blkptr_t zh_log; /* log chain */
	uint64_t zh_claim_blk_seq; /* highest claimed block sequence number */
	uint64_t zh_flags; /* header flags */
	uint64_t zh_claim_lr_seq; /* highest claimed lr sequence number */
	uint64_t zh_pad[3];
	} zil_header_t;

	/*
	* zh_flags bit settings
	*/
	#define ZIL_REPLAY_NEEDED 0x1 /* replay needed - internal only */
	#define ZIL_CLAIM_LR_SEQ_VALID 0x2 /* zh_claim_lr_seq field is valid */

	/*
	* Log block chaining.
	*
	* Log blocks are chained together. Originally they were chained at the
	* end of the block. For performance reasons the chain was moved to the
	* beginning of the block which allows writes for only the data being used.
	* The older position is supported for backwards compatibility.
	*
	* The zio_eck_t contains a zec_cksum which for the intent log is
	* the sequence number of this log block. A seq of 0 is invalid.
	* The zec_cksum is checked by the SPA against the sequence
	* number passed in the blk_cksum field of the blkptr_t
	*/
	typedef struct zil_chain {
	uint64_t zc_pad;
	blkptr_t zc_next_blk; /* next block in chain */
	uint64_t zc_nused; /* bytes in log block used */
	zio_eck_t zc_eck; /* block trailer */
	} zil_chain_t;

	#define ZIL_MIN_BLKSZ 4096ULL

	/*
	* The words of a log block checksum.
	*/
	#define ZIL_ZC_GUID_0 0
	#define ZIL_ZC_GUID_1 1
	#define ZIL_ZC_OBJSET 2
	#define ZIL_ZC_SEQ 3

	typedef enum zil_create {
	Z_FILE,
	Z_DIR,
	Z_XATTRDIR,
	} zil_create_t;

	/*
	* Size of the xvattr log section.
	* It is composed of lr_attr_t + xvattr bitmap + 2 64 bit timestamps
	* for create time and a single 64 bit integer for all of the attributes,
	* and 4 64 bit integers (32 bytes) for the scanstamp.
	*
	* lr_attr_t already holds the first bitmap entry, hence (mapsize - 1)
	* additional 32 bit words. The argument and the whole expansion are
	* parenthesized so the macro evaluates as a single operand inside a
	* larger expression.
	*/

	#define ZIL_XVAT_SIZE(mapsize) \
	(sizeof (lr_attr_t) + (sizeof (uint32_t) * ((mapsize) - 1)) + \
	(sizeof (uint64_t) * 7))

	/*
	* Size of ACL in log. The ACE data is padded out to properly align
	* on 8 byte boundary.
	*/

	#define ZIL_ACE_LENGTH(x) (roundup(x, sizeof (uint64_t)))

	/*
	* Intent log transaction types and record structures
	*/
	#define TX_CREATE 1 /* Create file */
	#define TX_MKDIR 2 /* Make directory */
	#define TX_MKXATTR 3 /* Make XATTR directory */
	#define TX_SYMLINK 4 /* Create symbolic link to a file */
	#define TX_REMOVE 5 /* Remove file */
	#define TX_RMDIR 6 /* Remove directory */
	#define TX_LINK 7 /* Create hard link to a file */
	#define TX_RENAME 8 /* Rename a file */
	#define TX_WRITE 9 /* File write */
	#define TX_TRUNCATE 10 /* Truncate a file */
	#define TX_SETATTR 11 /* Set file attributes */
	#define TX_ACL_V0 12 /* Set old formatted ACL */
	#define TX_ACL 13 /* Set ACL */
	#define TX_CREATE_ACL 14 /* create with ACL */
	#define TX_CREATE_ATTR 15 /* create + attrs */
	#define TX_CREATE_ACL_ATTR 16 /* create with ACL + attrs */
	#define TX_MKDIR_ACL 17 /* mkdir with ACL */
	#define TX_MKDIR_ATTR 18 /* mkdir with attr */
	#define TX_MKDIR_ACL_ATTR 19 /* mkdir with ACL + attrs */
	#define TX_WRITE2 20 /* dmu_sync EALREADY write */
	#define TX_MAX_TYPE 21 /* Max transaction type */

	/*
	* The transactions for mkdir, symlink, remove, rmdir, link, and rename
	* may have the following bit set, indicating the original request
	* specified case-insensitive handling of names.
	*/
	#define TX_CI ((uint64_t)0x1 << 63) /* case-insensitive behavior requested */

	/*
	* Transactions for write, truncate, setattr, acl_v0, and acl can be logged
	* out of order. For convenience in the code, all such records must have
	* lr_foid at the same offset.
	*
	* TX_OOO() evaluates to non-zero for those transaction types; TX_WRITE2
	* is included as well.
	*/
	#define TX_OOO(txtype) \
	((txtype) == TX_WRITE || \
	(txtype) == TX_TRUNCATE || \
	(txtype) == TX_SETATTR || \
	(txtype) == TX_ACL_V0 || \
	(txtype) == TX_ACL || \
	(txtype) == TX_WRITE2)

	/*
	* Format of log records.
	* The fields are carefully defined to allow them to be aligned
	* and sized the same on sparc & intel architectures.
	* Each log record has a common structure at the beginning.
	*
	* The log record on disk (lrc_seq) holds the sequence number of all log
	* records which is used to ensure we don't replay the same record.
	*/
	typedef struct { /* common log record header */
	uint64_t lrc_txtype; /* intent log transaction type */
	uint64_t lrc_reclen; /* transaction record length */
	uint64_t lrc_txg; /* dmu transaction group number */
	uint64_t lrc_seq; /* see comment above */
	} lr_t;

	/*
	* Common start of all out-of-order record types (TX_OOO() above).
	*/
	typedef struct {
	lr_t lr_common; /* common portion of log record */
	uint64_t lr_foid; /* object id */
	} lr_ooo_t;

	/*
	* Handle option extended vattr attributes.
	*
	* Whenever new attributes are added the version number
	* will need to be updated as will code in
	* zfs_log.c and zfs_replay.c
	*/
	typedef struct {
	uint32_t lr_attr_masksize; /* number of elements in array */
	uint32_t lr_attr_bitmap; /* First entry of array */
	/* remainder of array and any additional fields */
	} lr_attr_t;

	/*
	* log record for creates without optional ACL.
	* This log record does support optional xvattr_t attributes.
	*/
	typedef struct {
	lr_t lr_common; /* common portion of log record */
	uint64_t lr_doid; /* object id of directory */
	uint64_t lr_foid; /* object id of created file object */
	uint64_t lr_mode; /* mode of object */
	uint64_t lr_uid; /* uid of object */
	uint64_t lr_gid; /* gid of object */
	uint64_t lr_gen; /* generation (txg of creation) */
	uint64_t lr_crtime[2]; /* creation time */
	uint64_t lr_rdev; /* rdev of object to create */
	/* name of object to create follows this */
	/* for symlinks, link content follows name */
	/* for creates with xvattr data, the name follows the xvattr info */
	} lr_create_t;

	/*
	* FUID ACL record will be an array of ACEs from the original ACL.
	* If this array includes ephemeral IDs, the record will also include
	* an array of log-specific FUIDs to replace the ephemeral IDs.
	* Only one copy of each unique domain will be present, so the log-specific
	* FUIDs will use an index into a compressed domain table. On replay this
	* information will be used to construct real FUIDs (and bypass idmap,
	* since it may not be available).
	*/

	/*
	* Log record for creates with optional ACL
	* This log record is also used for recording any FUID
	* information needed for replaying the create. If the
	* file doesn't have any actual ACEs then the lr_aclcnt
	* would be zero.
	*
	* After lr_acl_flags, there are a lr_acl_bytes number of variable sized ace's.
	* If create is also setting xvattr's, then acl data follows xvattr.
	* If ACE FUIDs are needed then they will follow the xvattr_t. Following
	* the FUIDs will be the domain table information. The FUIDs for the owner
	* and group will be in lr_create. Name follows ACL data.
	*/
	typedef struct {
	lr_create_t lr_create; /* common create portion */
	uint64_t lr_aclcnt; /* number of ACEs in ACL */
	uint64_t lr_domcnt; /* number of unique domains */
	uint64_t lr_fuidcnt; /* number of real fuids */
	uint64_t lr_acl_bytes; /* number of bytes in ACL */
	uint64_t lr_acl_flags; /* ACL flags */
	} lr_acl_create_t;

	typedef struct {
	lr_t lr_common; /* common portion of log record */
	uint64_t lr_doid; /* obj id of directory */
	/* name of object to remove follows this */
	} lr_remove_t;

	typedef struct {
	lr_t lr_common; /* common portion of log record */
	uint64_t lr_doid; /* obj id of directory */
	uint64_t lr_link_obj; /* obj id of link */
	/* name of object to link follows this */
	} lr_link_t;

	typedef struct {
	lr_t lr_common; /* common portion of log record */
	uint64_t lr_sdoid; /* obj id of source directory */
	uint64_t lr_tdoid; /* obj id of target directory */
	/* 2 strings: names of source and destination follow this */
	} lr_rename_t;

	typedef struct {
	lr_t lr_common; /* common portion of log record */
	uint64_t lr_foid; /* file object to write */
	uint64_t lr_offset; /* offset to write to */
	uint64_t lr_length; /* user data length to write */
	uint64_t lr_blkoff; /* no longer used */
	blkptr_t lr_blkptr; /* spa block pointer for replay */
	/* write data will follow for small writes */
	} lr_write_t;

	typedef struct {
	lr_t lr_common; /* common portion of log record */
	uint64_t lr_foid; /* object id of file to truncate */
	uint64_t lr_offset; /* offset to truncate from */
	uint64_t lr_length; /* length to truncate */
	} lr_truncate_t;

	typedef struct {
	lr_t lr_common; /* common portion of log record */
	uint64_t lr_foid; /* file object to change attributes */
	uint64_t lr_mask; /* mask of attributes to set */
	uint64_t lr_mode; /* mode to set */
	uint64_t lr_uid; /* uid to set */
	uint64_t lr_gid; /* gid to set */
	uint64_t lr_size; /* size to set */
	uint64_t lr_atime[2]; /* access time */
	uint64_t lr_mtime[2]; /* modification time */
	/* optional attribute lr_attr_t may be here */
	} lr_setattr_t;

	typedef struct {
	lr_t lr_common; /* common portion of log record */
	uint64_t lr_foid; /* obj id of file */
	uint64_t lr_aclcnt; /* number of acl entries */
	/* lr_aclcnt number of ace_t entries follow this */
	} lr_acl_v0_t;

	/*
	* Log record for an ACL update with variable-sized ACEs: lr_acl_bytes
	* bytes of ACE data are stored immediately after this structure.
	*/
	typedef struct {
	lr_t lr_common; /* common portion of log record */
	uint64_t lr_foid; /* obj id of file */
	uint64_t lr_aclcnt; /* number of ACEs in ACL */
	uint64_t lr_domcnt; /* number of unique domains */
	uint64_t lr_fuidcnt; /* number of real fuids */
	uint64_t lr_acl_bytes; /* number of bytes in ACL */
	uint64_t lr_acl_flags; /* ACL flags */
	/* lr_acl_bytes number of variable sized ace's follows */
	} lr_acl_t;

	/*
	* ZIL structure definitions, interface function prototype and globals.
	*/

	/*
	* Writes are handled in three different ways:
	*
	* WR_INDIRECT:
	* In this mode, if we need to commit the write later, then the block
	* is immediately written into the file system (using dmu_sync),
	* and a pointer to the block is put into the log record.
	* When the txg commits the block is linked in.
	* This saves additionally writing the data into the log record.
	* There are a few requirements for this to occur:
	* - write is greater than zfs/zvol_immediate_write_sz
	* - not using slogs (as slogs are assumed to always be faster
	* than writing into the main pool)
	* - the write occupies only one block
	* WR_COPIED:
	* If we know we'll immediately be committing the
	* transaction (FSYNC or FDSYNC), then we allocate a larger
	* log record here for the data and copy the data in.
	* WR_NEED_COPY:
	* Otherwise we don't allocate a buffer, and if we need to
	* flush the write later then a buffer is allocated and
	* we retrieve the data using the dmu.
	*/
	/*
	* How the data of a logged write is carried; stored in the
	* itx_wr_state field of itx_t. WR_NUM_STATES is a count, not a state.
	*/
	typedef enum {
	WR_INDIRECT, /* indirect - a large write (dmu_sync() data */
	/* and put blkptr in log, rather than actual data) */
	WR_COPIED, /* immediate - data is copied into lr_write_t */
	WR_NEED_COPY, /* immediate - data needs to be copied if pushed */
	WR_NUM_STATES /* number of states */
	} itx_wr_state_t;

	typedef struct itx {
	list_node_t itx_node; /* linkage on zl_itx_list */
	void itx_private; / type-specific opaque data */
	itx_wr_state_t itx_wr_state; /* write state */
	uint8_t itx_sync; /* synchronous transaction */
	uint64_t itx_sod; /* record size on disk */
	uint64_t itx_oid; /* object id */
	lr_t itx_lr; /* common part of log record */
	/* followed by type-specific part of lr_xx_t and its immediate data */
	} itx_t;

	/*
	 * Callback types used when parsing and replaying the intent log, and
	 * the callback through which the ZIL fetches write data.
	 */
	typedef int zil_parse_blk_func_t(zilog_t *zilog, blkptr_t *bp, void *arg,
	    uint64_t txg);
	typedef int zil_parse_lr_func_t(zilog_t *zilog, lr_t *lr, void *arg,
	    uint64_t txg);
	typedef int zil_replay_func_t();
	typedef int zil_get_data_t(void *arg, lr_write_t *lr, char *dbuf, zio_t *zio);

	extern int zil_parse(zilog_t *zilog, zil_parse_blk_func_t *parse_blk_func,
	    zil_parse_lr_func_t *parse_lr_func, void *arg, uint64_t txg);

	extern void zil_init(void);
	extern void zil_fini(void);

	extern zilog_t *zil_alloc(objset_t *os, zil_header_t *zh_phys);
	extern void zil_free(zilog_t *zilog);

	extern zilog_t *zil_open(objset_t *os, zil_get_data_t *get_data);
	extern void zil_close(zilog_t *zilog);

	extern void zil_replay(objset_t *os, void *arg,
	    zil_replay_func_t *replay_func[TX_MAX_TYPE]);
	extern boolean_t zil_replaying(zilog_t *zilog, dmu_tx_t *tx);
	extern void zil_destroy(zilog_t *zilog, boolean_t keep_first);
	extern void zil_destroy_sync(zilog_t *zilog, dmu_tx_t *tx);
	extern void zil_rollback_destroy(zilog_t *zilog, dmu_tx_t *tx);

	extern itx_t *zil_itx_create(uint64_t txtype, size_t lrsize);
	extern void zil_itx_destroy(itx_t *itx);
	extern void zil_itx_assign(zilog_t *zilog, itx_t *itx, dmu_tx_t *tx);

	extern void zil_commit(zilog_t *zilog, uint64_t oid);

	extern int zil_vdev_offline(const char *osname, void *txarg);
	-extern int zil_claim(const char *osname, void *txarg);
	-extern int zil_check_log_chain(const char *osname, void *txarg);
	+extern int zil_claim(struct dsl_pool *dp,
	+    struct dsl_dataset *ds, void *txarg);
	+extern int zil_check_log_chain(struct dsl_pool *dp,
	+    struct dsl_dataset *ds, void *tx);
	extern void zil_sync(zilog_t *zilog, dmu_tx_t *tx);
	extern void zil_clean(zilog_t *zilog, uint64_t synced_txg);

	extern int zil_suspend(const char *osname, void **cookiep);
	extern void zil_resume(void *cookie);

	extern void zil_add_block(zilog_t *zilog, const blkptr_t *bp);
	extern int zil_bp_tree_add(zilog_t *zilog, const blkptr_t *bp);

	extern void zil_set_sync(zilog_t *zilog, uint64_t syncval);

	extern void zil_set_logbias(zilog_t *zilog, uint64_t slogval);

	extern int zil_replay_disable;

	#ifdef __cplusplus
	}
	#endif

	#endif /* _SYS_ZIL_H */
	Index: vendor-sys/illumos/dist/uts/common/fs/zfs/vdev.c
	===================================================================
	--- vendor-sys/illumos/dist/uts/common/fs/zfs/vdev.c (revision 284761)
	+++ vendor-sys/illumos/dist/uts/common/fs/zfs/vdev.c (revision 284762)
	@@ -1,3348 +1,3368 @@
	/*
	* CDDL HEADER START
	*
	* The contents of this file are subject to the terms of the
	* Common Development and Distribution License (the "License").
	* You may not use this file except in compliance with the License.
	*
	* You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
	* or http://www.opensolaris.org/os/licensing.
	* See the License for the specific language governing permissions
	* and limitations under the License.
	*
	* When distributing Covered Code, include this CDDL HEADER in each
	* file and include the License file at usr/src/OPENSOLARIS.LICENSE.
	* If applicable, add the following below this CDDL HEADER, with the
	* fields enclosed by brackets "[]" replaced with your own identifying
	* information: Portions Copyright [yyyy] [name of copyright owner]
	*
	* CDDL HEADER END
	*/

	/*
	* Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
	* Copyright (c) 2011, 2015 by Delphix. All rights reserved.
	* Copyright 2015 Nexenta Systems, Inc. All rights reserved.
	*/

	#include <sys/zfs_context.h>
	#include <sys/fm/fs/zfs.h>
	#include <sys/spa.h>
	#include <sys/spa_impl.h>
	#include <sys/dmu.h>
	#include <sys/dmu_tx.h>
	#include <sys/vdev_impl.h>
	#include <sys/uberblock_impl.h>
	#include <sys/metaslab.h>
	#include <sys/metaslab_impl.h>
	#include <sys/space_map.h>
	#include <sys/space_reftree.h>
	#include <sys/zio.h>
	#include <sys/zap.h>
	#include <sys/fs/zfs.h>
	#include <sys/arc.h>
	#include <sys/zil.h>
	#include <sys/dsl_scan.h>

	/*
	* Virtual device management.
	*/

	/*
	* Table of the known vdev ops vectors. vdev_getops() scans it in order
	* and stops at the terminating NULL entry.
	*/
	static vdev_ops_t *vdev_ops_table[] = {
	&vdev_root_ops,
	&vdev_raidz_ops,
	&vdev_mirror_ops,
	&vdev_replacing_ops,
	&vdev_spare_ops,
	&vdev_disk_ops,
	&vdev_file_ops,
	&vdev_missing_ops,
	&vdev_hole_ops,
	NULL
	};

	/* Maximum scrub/resilver I/O queue per leaf vdev. */
	int zfs_scrub_limit = 10;

	/*
	* When a vdev is added, it will be divided into approximately (but no
	* more than) this number of metaslabs.
	*/
	int metaslabs_per_vdev = 200;

	/*
	* Given a vdev type, return the appropriate ops vector.
	*/
	static vdev_ops_t *
	vdev_getops(const char *type)
	{
	vdev_ops_t ops, *opspp;

	for (opspp = vdev_ops_table; (ops = *opspp) != NULL; opspp++)
	if (strcmp(ops->vdev_op_type, type) == 0)
	break;

	return (ops);
	}

	/*
	 * Default asize function, used by anything other than RAID-Z: take
	 * psize rounded up to the top-level vdev's ashift, then raise it to the
	 * largest asize any child reports for the same psize.
	 */
	uint64_t
	vdev_default_asize(vdev_t *vd, uint64_t psize)
	{
		uint64_t largest = P2ROUNDUP(psize, 1ULL << vd->vdev_top->vdev_ashift);

		for (int i = 0; i < vd->vdev_children; i++) {
			uint64_t child_asize =
			    vdev_psize_to_asize(vd->vdev_child[i], psize);

			largest = MAX(largest, child_asize);
		}

		return (largest);
	}

	/*
	 * Get the minimum allocatable size. We define the allocatable size as
	 * the vdev's asize rounded to the nearest metaslab. This allows us to
	 * replace or attach devices which don't have the same physical size but
	 * can still satisfy the same number of allocations.
	 */
	uint64_t
	vdev_get_min_asize(vdev_t *vd)
	{
		vdev_t *parent = vd->vdev_parent;

		/* No parent (inactive spare or cache): use our own asize. */
		if (parent == NULL)
			return (vd->vdev_asize);

		/* A top-level vdev reports its asize aligned to the metaslab size. */
		if (vd->vdev_top == vd)
			return (P2ALIGN(vd->vdev_asize, 1ULL << vd->vdev_ms_shift));

		/*
		 * A raidz parent needs each child to supply 1/Nth of the parent's
		 * minimum; any other parent needs the full amount from each child.
		 */
		return (parent->vdev_ops == &vdev_raidz_ops ?
		    parent->vdev_min_asize / parent->vdev_children :
		    parent->vdev_min_asize);
	}

	/*
	 * Recompute vdev_min_asize for vd and then for every vdev beneath it.
	 * The parent is updated first because vdev_get_min_asize() on a child
	 * reads the parent's vdev_min_asize.
	 */
	void
	vdev_set_min_asize(vdev_t *vd)
	{
		int i;

		vd->vdev_min_asize = vdev_get_min_asize(vd);

		for (i = 0; i < vd->vdev_children; i++) {
			vdev_set_min_asize(vd->vdev_child[i]);
		}
	}

	/*
	 * Return the child of the root vdev at index 'vdev', or NULL if the
	 * index is out of range. The caller must hold a config lock.
	 */
	vdev_t *
	vdev_lookup_top(spa_t *spa, uint64_t vdev)
	{
		vdev_t *root = spa->spa_root_vdev;

		ASSERT(spa_config_held(spa, SCL_ALL, RW_READER) != 0);

		if (vdev >= root->vdev_children)
			return (NULL);

		ASSERT(root->vdev_child[vdev] != NULL);
		return (root->vdev_child[vdev]);
	}

	/*
	 * Depth-first search of the subtree rooted at vd for a vdev whose guid
	 * matches. Returns the first match, or NULL if there is none.
	 */
	vdev_t *
	vdev_lookup_by_guid(vdev_t *vd, uint64_t guid)
	{
		if (vd->vdev_guid == guid)
			return (vd);

		for (int i = 0; i < vd->vdev_children; i++) {
			vdev_t *found = vdev_lookup_by_guid(vd->vdev_child[i], guid);

			if (found != NULL)
				return (found);
		}

		return (NULL);
	}

	+/*
	+ * Count the leaf vdevs in the subtree rooted at vd.
	+ */
	+static int
	+vdev_count_leaves_impl(vdev_t *vd)
	+{
	+	int leaves;
	+
	+	/* A leaf counts as exactly one. */
	+	if (vd->vdev_ops->vdev_op_leaf)
	+		return (1);
	+
	+	leaves = 0;
	+	for (int i = 0; i < vd->vdev_children; i++)
	+		leaves += vdev_count_leaves_impl(vd->vdev_child[i]);
	+
	+	return (leaves);
	+}
	+
	+/*
	+ * Return the number of leaf vdevs under the pool's root vdev.
	+ */
	+int
	+vdev_count_leaves(spa_t *spa)
	+{
	+	vdev_t *root = spa->spa_root_vdev;
	+
	+	return (vdev_count_leaves_impl(root));
	+}
	+
	/*
	 * Attach cvd to pvd at index cvd->vdev_id, growing pvd's child array
	 * if needed, and add cvd's guid sum to every ancestor.
	 *
	 * pvd may be NULL, in which case cvd simply has no parent. The caller
	 * must hold all config locks as writer.
	 */
	void
	vdev_add_child(vdev_t *pvd, vdev_t *cvd)
	{
		size_t oldsize, newsize;
		uint64_t id = cvd->vdev_id;
		vdev_t **newchild;
		spa_t *spa = cvd->vdev_spa;

		ASSERT(spa_config_held(spa, SCL_ALL, RW_WRITER) == SCL_ALL);
		ASSERT(cvd->vdev_parent == NULL);

		cvd->vdev_parent = pvd;

		if (pvd == NULL)
			return;

		/* The target slot must be beyond the array or currently empty. */
		ASSERT(id >= pvd->vdev_children || pvd->vdev_child[id] == NULL);

		oldsize = pvd->vdev_children * sizeof (vdev_t *);
		pvd->vdev_children = MAX(pvd->vdev_children, id + 1);
		newsize = pvd->vdev_children * sizeof (vdev_t *);

		/* Zeroed allocation: slots between the old end and id stay NULL. */
		newchild = kmem_zalloc(newsize, KM_SLEEP);
		if (pvd->vdev_child != NULL) {
			bcopy(pvd->vdev_child, newchild, oldsize);
			kmem_free(pvd->vdev_child, oldsize);
		}

		pvd->vdev_child = newchild;
		pvd->vdev_child[id] = cvd;

		cvd->vdev_top = (pvd->vdev_top ? pvd->vdev_top: cvd);
		ASSERT(cvd->vdev_top->vdev_parent->vdev_parent == NULL);

		/*
		 * Walk up all ancestors to update guid sum.
		 */
		for (; pvd != NULL; pvd = pvd->vdev_parent)
			pvd->vdev_guid_sum += cvd->vdev_guid_sum;
	}

	/*
	 * Detach cvd from pvd. The slot is set to NULL rather than compacted;
	 * the child array is freed only once every slot is empty. cvd's guid
	 * sum is subtracted from every ancestor.
	 *
	 * pvd may be NULL, in which case there is nothing to do.
	 */
	void
	vdev_remove_child(vdev_t *pvd, vdev_t *cvd)
	{
		int c;
		uint_t id = cvd->vdev_id;

		ASSERT(cvd->vdev_parent == pvd);

		if (pvd == NULL)
			return;

		ASSERT(id < pvd->vdev_children);
		ASSERT(pvd->vdev_child[id] == cvd);

		pvd->vdev_child[id] = NULL;
		cvd->vdev_parent = NULL;

		/* Look for any remaining child. */
		for (c = 0; c < pvd->vdev_children; c++)
			if (pvd->vdev_child[c])
				break;

		/* None left: release the array. */
		if (c == pvd->vdev_children) {
			kmem_free(pvd->vdev_child, c * sizeof (vdev_t *));
			pvd->vdev_child = NULL;
			pvd->vdev_children = 0;
		}

		/*
		 * Walk up all ancestors to update guid sum.
		 */
		for (; pvd != NULL; pvd = pvd->vdev_parent)
			pvd->vdev_guid_sum -= cvd->vdev_guid_sum;
	}

	/*
	 * Remove any holes in the child array: the surviving children are
	 * packed into a new array in their existing order and each one's
	 * vdev_id is renumbered to its new index. The caller must hold all
	 * config locks as writer.
	 */
	void
	vdev_compact_children(vdev_t *pvd)
	{
		vdev_t **newchild, *cvd;
		int oldc = pvd->vdev_children;
		int newc;

		ASSERT(spa_config_held(pvd->vdev_spa, SCL_ALL, RW_WRITER) == SCL_ALL);

		/* First pass: count the non-NULL slots. */
		for (int c = newc = 0; c < oldc; c++)
			if (pvd->vdev_child[c])
				newc++;

		newchild = kmem_alloc(newc * sizeof (vdev_t *), KM_SLEEP);

		/* Second pass: copy the survivors and renumber them. */
		for (int c = newc = 0; c < oldc; c++) {
			if ((cvd = pvd->vdev_child[c]) != NULL) {
				newchild[newc] = cvd;
				cvd->vdev_id = newc++;
			}
		}

		kmem_free(pvd->vdev_child, oldc * sizeof (vdev_t *));
		pvd->vdev_child = newchild;
		pvd->vdev_children = newc;
	}

	/*
	 * Allocate and minimally initialize a vdev_t.
	 *
	 * If the pool has no root vdev yet, the new vdev becomes the root. If
	 * guid is 0 and the vdev is not a hole, a fresh guid is generated.
	 * The returned vdev is in VDEV_STATE_CLOSED and has no parent.
	 */
	vdev_t *
	vdev_alloc_common(spa_t *spa, uint_t id, uint64_t guid, vdev_ops_t *ops)
	{
		vdev_t *vd;

		vd = kmem_zalloc(sizeof (vdev_t), KM_SLEEP);

		if (spa->spa_root_vdev == NULL) {
			ASSERT(ops == &vdev_root_ops);
			spa->spa_root_vdev = vd;
			spa->spa_load_guid = spa_generate_guid(NULL);
		}

		if (guid == 0 && ops != &vdev_hole_ops) {
			if (spa->spa_root_vdev == vd) {
				/*
				 * The root vdev's guid will also be the pool guid,
				 * which must be unique among all pools.
				 */
				guid = spa_generate_guid(NULL);
			} else {
				/*
				 * Any other vdev's guid must be unique within the pool.
				 */
				guid = spa_generate_guid(spa);
			}
			ASSERT(!spa_guid_exists(spa_guid(spa), guid));
		}

		vd->vdev_spa = spa;
		vd->vdev_id = id;
		vd->vdev_guid = guid;
		/* With no children yet, the guid sum is just our own guid. */
		vd->vdev_guid_sum = guid;
		vd->vdev_ops = ops;
		vd->vdev_state = VDEV_STATE_CLOSED;
		vd->vdev_ishole = (ops == &vdev_hole_ops);

		mutex_init(&vd->vdev_dtl_lock, NULL, MUTEX_DEFAULT, NULL);
		mutex_init(&vd->vdev_stat_lock, NULL, MUTEX_DEFAULT, NULL);
		mutex_init(&vd->vdev_probe_lock, NULL, MUTEX_DEFAULT, NULL);
		for (int t = 0; t < DTL_TYPES; t++) {
			vd->vdev_dtl[t] = range_tree_create(NULL, NULL,
			    &vd->vdev_dtl_lock);
		}
		txg_list_create(&vd->vdev_ms_list,
		    offsetof(struct metaslab, ms_txg_node));
		txg_list_create(&vd->vdev_dtl_list,
		    offsetof(struct vdev, vdev_dtl_node));
		vd->vdev_stat.vs_timestamp = gethrtime();
		vdev_queue_init(vd);
		vdev_cache_init(vd);

		return (vd);
	}

	/*
	 * Allocate a new vdev from its nvlist description and attach it to
	 * 'parent'. The 'alloctype' is used to control whether we are creating
	 * a new vdev or loading an existing one - the behavior is slightly
	 * different for each case.
	 *
	 * On success the new vdev is stored in *vdp and 0 is returned. Returns
	 * EINVAL for a malformed or inconsistent description and ENOTSUP when
	 * the pool version does not support the requested feature; nothing is
	 * allocated on those paths.
	 */
	int
	vdev_alloc(spa_t *spa, vdev_t **vdp, nvlist_t *nv, vdev_t *parent, uint_t id,
	    int alloctype)
	{
		vdev_ops_t *ops;
		char *type;
		uint64_t guid = 0, islog, nparity;
		vdev_t *vd;

		ASSERT(spa_config_held(spa, SCL_ALL, RW_WRITER) == SCL_ALL);

		if (nvlist_lookup_string(nv, ZPOOL_CONFIG_TYPE, &type) != 0)
			return (SET_ERROR(EINVAL));

		if ((ops = vdev_getops(type)) == NULL)
			return (SET_ERROR(EINVAL));

		/*
		 * If this is a load, get the vdev guid from the nvlist.
		 * Otherwise, vdev_alloc_common() will generate one for us.
		 */
		if (alloctype == VDEV_ALLOC_LOAD) {
			uint64_t label_id;

			if (nvlist_lookup_uint64(nv, ZPOOL_CONFIG_ID, &label_id) ||
			    label_id != id)
				return (SET_ERROR(EINVAL));

			if (nvlist_lookup_uint64(nv, ZPOOL_CONFIG_GUID, &guid) != 0)
				return (SET_ERROR(EINVAL));
		} else if (alloctype == VDEV_ALLOC_SPARE) {
			if (nvlist_lookup_uint64(nv, ZPOOL_CONFIG_GUID, &guid) != 0)
				return (SET_ERROR(EINVAL));
		} else if (alloctype == VDEV_ALLOC_L2CACHE) {
			if (nvlist_lookup_uint64(nv, ZPOOL_CONFIG_GUID, &guid) != 0)
				return (SET_ERROR(EINVAL));
		} else if (alloctype == VDEV_ALLOC_ROOTPOOL) {
			if (nvlist_lookup_uint64(nv, ZPOOL_CONFIG_GUID, &guid) != 0)
				return (SET_ERROR(EINVAL));
		}

		/*
		 * The first allocated vdev must be of type 'root'.
		 */
		if (ops != &vdev_root_ops && spa->spa_root_vdev == NULL)
			return (SET_ERROR(EINVAL));

		/*
		 * Determine whether we're a log vdev.
		 */
		islog = 0;
		(void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_IS_LOG, &islog);
		if (islog && spa_version(spa) < SPA_VERSION_SLOGS)
			return (SET_ERROR(ENOTSUP));

		if (ops == &vdev_hole_ops && spa_version(spa) < SPA_VERSION_HOLES)
			return (SET_ERROR(ENOTSUP));

		/*
		 * Set the nparity property for RAID-Z vdevs.
		 */
		nparity = -1ULL;
		if (ops == &vdev_raidz_ops) {
			if (nvlist_lookup_uint64(nv, ZPOOL_CONFIG_NPARITY,
			    &nparity) == 0) {
				if (nparity == 0 || nparity > VDEV_RAIDZ_MAXPARITY)
					return (SET_ERROR(EINVAL));
				/*
				 * Previous versions could only support 1 or 2 parity
				 * devices.
				 */
				if (nparity > 1 &&
				    spa_version(spa) < SPA_VERSION_RAIDZ2)
					return (SET_ERROR(ENOTSUP));
				if (nparity > 2 &&
				    spa_version(spa) < SPA_VERSION_RAIDZ3)
					return (SET_ERROR(ENOTSUP));
			} else {
				/*
				 * We require the parity to be specified for SPAs that
				 * support multiple parity levels.
				 */
				if (spa_version(spa) >= SPA_VERSION_RAIDZ2)
					return (SET_ERROR(EINVAL));
				/*
				 * Otherwise, we default to 1 parity device for RAID-Z.
				 */
				nparity = 1;
			}
		} else {
			nparity = 0;
		}
		ASSERT(nparity != -1ULL);

		vd = vdev_alloc_common(spa, id, guid, ops);

		vd->vdev_islog = islog;
		vd->vdev_nparity = nparity;

		/*
		 * The lookups return pointers into the nvlist, so each string is
		 * duplicated before being kept.
		 */
		if (nvlist_lookup_string(nv, ZPOOL_CONFIG_PATH, &vd->vdev_path) == 0)
			vd->vdev_path = spa_strdup(vd->vdev_path);
		if (nvlist_lookup_string(nv, ZPOOL_CONFIG_DEVID, &vd->vdev_devid) == 0)
			vd->vdev_devid = spa_strdup(vd->vdev_devid);
		if (nvlist_lookup_string(nv, ZPOOL_CONFIG_PHYS_PATH,
		    &vd->vdev_physpath) == 0)
			vd->vdev_physpath = spa_strdup(vd->vdev_physpath);
		if (nvlist_lookup_string(nv, ZPOOL_CONFIG_FRU, &vd->vdev_fru) == 0)
			vd->vdev_fru = spa_strdup(vd->vdev_fru);

		/*
		 * Set the whole_disk property. If it's not specified, leave the value
		 * as -1.
		 */
		if (nvlist_lookup_uint64(nv, ZPOOL_CONFIG_WHOLE_DISK,
		    &vd->vdev_wholedisk) != 0)
			vd->vdev_wholedisk = -1ULL;

		/*
		 * Look for the 'not present' flag. This will only be set if the device
		 * was not present at the time of import.
		 */
		(void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_NOT_PRESENT,
		    &vd->vdev_not_present);

		/*
		 * Get the alignment requirement.
		 */
		(void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_ASHIFT, &vd->vdev_ashift);

		/*
		 * Retrieve the vdev creation time.
		 */
		(void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_CREATE_TXG,
		    &vd->vdev_crtxg);

		/*
		 * If we're a top-level vdev, try to load the allocation parameters.
		 */
		if (parent && !parent->vdev_parent &&
		    (alloctype == VDEV_ALLOC_LOAD || alloctype == VDEV_ALLOC_SPLIT)) {
			(void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_METASLAB_ARRAY,
			    &vd->vdev_ms_array);
			(void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_METASLAB_SHIFT,
			    &vd->vdev_ms_shift);
			(void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_ASIZE,
			    &vd->vdev_asize);
			(void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_REMOVING,
			    &vd->vdev_removing);
		}

		if (parent && !parent->vdev_parent && alloctype != VDEV_ALLOC_ATTACH) {
			ASSERT(alloctype == VDEV_ALLOC_LOAD ||
			    alloctype == VDEV_ALLOC_ADD ||
			    alloctype == VDEV_ALLOC_SPLIT ||
			    alloctype == VDEV_ALLOC_ROOTPOOL);
			vd->vdev_mg = metaslab_group_create(islog ?
			    spa_log_class(spa) : spa_normal_class(spa), vd);
		}

		/*
		 * If we're a leaf vdev, try to load the DTL object and other state.
		 */
		if (vd->vdev_ops->vdev_op_leaf &&
		    (alloctype == VDEV_ALLOC_LOAD || alloctype == VDEV_ALLOC_L2CACHE ||
		    alloctype == VDEV_ALLOC_ROOTPOOL)) {
			if (alloctype == VDEV_ALLOC_LOAD) {
				(void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_DTL,
				    &vd->vdev_dtl_object);
				(void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_UNSPARE,
				    &vd->vdev_unspare);
			}

			if (alloctype == VDEV_ALLOC_ROOTPOOL) {
				uint64_t spare = 0;

				if (nvlist_lookup_uint64(nv, ZPOOL_CONFIG_IS_SPARE,
				    &spare) == 0 && spare)
					spa_spare_add(vd);
			}

			(void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_OFFLINE,
			    &vd->vdev_offline);

			(void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_RESILVER_TXG,
			    &vd->vdev_resilver_txg);

			/*
			 * When importing a pool, we want to ignore the persistent fault
			 * state, as the diagnosis made on another system may not be
			 * valid in the current context. Local vdevs will
			 * remain in the faulted state.
			 */
			if (spa_load_state(spa) == SPA_LOAD_OPEN) {
				(void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_FAULTED,
				    &vd->vdev_faulted);
				(void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_DEGRADED,
				    &vd->vdev_degraded);
				(void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_REMOVED,
				    &vd->vdev_removed);

				if (vd->vdev_faulted || vd->vdev_degraded) {
					char *aux;

					vd->vdev_label_aux =
					    VDEV_AUX_ERR_EXCEEDED;
					if (nvlist_lookup_string(nv,
					    ZPOOL_CONFIG_AUX_STATE, &aux) == 0 &&
					    strcmp(aux, "external") == 0)
						vd->vdev_label_aux = VDEV_AUX_EXTERNAL;
				}
			}
		}

		/*
		 * Add ourselves to the parent's list of children.
		 */
		vdev_add_child(parent, vd);

		*vdp = vd;

		return (0);
	}

	/*
	* Close vd and free it together with every vdev beneath it. The vdev is
	* detached from its parent, its allocation state, strings, lists, DTL
	* range trees and locks are torn down, and spa_root_vdev is cleared if
	* vd was the pool's root.
	*/
	void
	vdev_free(vdev_t *vd)
	{
	spa_t *spa = vd->vdev_spa;

	/*
	* vdev_free() implies closing the vdev first. This is simpler than
	* trying to ensure complicated semantics for all callers.
	*/
	vdev_close(vd);

	ASSERT(!list_link_active(&vd->vdev_config_dirty_node));
	ASSERT(!list_link_active(&vd->vdev_state_dirty_node));

	/*
	* Free all children.
	*/
	for (int c = 0; c < vd->vdev_children; c++)
	vdev_free(vd->vdev_child[c]);

	/*
	* Each child removed itself from our child array on its way out, so
	* the array is gone and only our own guid remains in the sum.
	*/
	ASSERT(vd->vdev_child == NULL);
	ASSERT(vd->vdev_guid_sum == vd->vdev_guid);

	/*
	* Discard allocation state.
	*/
	if (vd->vdev_mg != NULL) {
	vdev_metaslab_fini(vd);
	metaslab_group_destroy(vd->vdev_mg);
	}

	ASSERT0(vd->vdev_stat.vs_space);
	ASSERT0(vd->vdev_stat.vs_dspace);
	ASSERT0(vd->vdev_stat.vs_alloc);

	/*
	* Remove this vdev from its parent's child list.
	*/
	vdev_remove_child(vd->vdev_parent, vd);

	ASSERT(vd->vdev_parent == NULL);

	/*
	* Clean up vdev structure.
	*/
	vdev_queue_fini(vd);
	vdev_cache_fini(vd);

	if (vd->vdev_path)
	spa_strfree(vd->vdev_path);
	if (vd->vdev_devid)
	spa_strfree(vd->vdev_devid);
	if (vd->vdev_physpath)
	spa_strfree(vd->vdev_physpath);
	if (vd->vdev_fru)
	spa_strfree(vd->vdev_fru);

	if (vd->vdev_isspare)
	spa_spare_remove(vd);
	if (vd->vdev_isl2cache)
	spa_l2cache_remove(vd);

	txg_list_destroy(&vd->vdev_ms_list);
	txg_list_destroy(&vd->vdev_dtl_list);

	/* The DTL range trees are emptied and destroyed under vdev_dtl_lock. */
	mutex_enter(&vd->vdev_dtl_lock);
	space_map_close(vd->vdev_dtl_sm);
	for (int t = 0; t < DTL_TYPES; t++) {
	range_tree_vacate(vd->vdev_dtl[t], NULL, NULL);
	range_tree_destroy(vd->vdev_dtl[t]);
	}
	mutex_exit(&vd->vdev_dtl_lock);

	mutex_destroy(&vd->vdev_dtl_lock);
	mutex_destroy(&vd->vdev_stat_lock);
	mutex_destroy(&vd->vdev_probe_lock);

	if (vd == spa->spa_root_vdev)
	spa->spa_root_vdev = NULL;

	kmem_free(vd, sizeof (vdev_t));
	}

	/*
	 * Transfer top-level vdev state from svd to tvd: metaslab array
	 * parameters, the metaslab group and metaslabs, space statistics,
	 * per-txg list membership, config/state dirty status, the deflate
	 * ratio and the log flag. Each transferred field is cleared on svd.
	 * tvd must already be a top-level vdev.
	 */
	static void
	vdev_top_transfer(vdev_t *svd, vdev_t *tvd)
	{
		spa_t *spa = svd->vdev_spa;
		metaslab_t *msp;
		vdev_t *vd;
		int t;

		ASSERT(tvd == tvd->vdev_top);

		tvd->vdev_ms_array = svd->vdev_ms_array;
		tvd->vdev_ms_shift = svd->vdev_ms_shift;
		tvd->vdev_ms_count = svd->vdev_ms_count;

		svd->vdev_ms_array = 0;
		svd->vdev_ms_shift = 0;
		svd->vdev_ms_count = 0;

		/* If tvd already has a metaslab group it must be svd's. */
		if (tvd->vdev_mg)
			ASSERT3P(tvd->vdev_mg, ==, svd->vdev_mg);
		tvd->vdev_mg = svd->vdev_mg;
		tvd->vdev_ms = svd->vdev_ms;

		svd->vdev_mg = NULL;
		svd->vdev_ms = NULL;

		/* Point the group back at its new owner. */
		if (tvd->vdev_mg != NULL)
			tvd->vdev_mg->mg_vd = tvd;

		tvd->vdev_stat.vs_alloc = svd->vdev_stat.vs_alloc;
		tvd->vdev_stat.vs_space = svd->vdev_stat.vs_space;
		tvd->vdev_stat.vs_dspace = svd->vdev_stat.vs_dspace;

		svd->vdev_stat.vs_alloc = 0;
		svd->vdev_stat.vs_space = 0;
		svd->vdev_stat.vs_dspace = 0;

		/* Move every per-txg list entry from svd to tvd. */
		for (t = 0; t < TXG_SIZE; t++) {
			while ((msp = txg_list_remove(&svd->vdev_ms_list, t)) != NULL)
				(void) txg_list_add(&tvd->vdev_ms_list, msp, t);
			while ((vd = txg_list_remove(&svd->vdev_dtl_list, t)) != NULL)
				(void) txg_list_add(&tvd->vdev_dtl_list, vd, t);
			if (txg_list_remove_this(&spa->spa_vdev_txg_list, svd, t))
				(void) txg_list_add(&spa->spa_vdev_txg_list, tvd, t);
		}

		if (list_link_active(&svd->vdev_config_dirty_node)) {
			vdev_config_clean(svd);
			vdev_config_dirty(tvd);
		}

		if (list_link_active(&svd->vdev_state_dirty_node)) {
			vdev_state_clean(svd);
			vdev_state_dirty(tvd);
		}

		tvd->vdev_deflate_ratio = svd->vdev_deflate_ratio;
		svd->vdev_deflate_ratio = 0;

		tvd->vdev_islog = svd->vdev_islog;
		svd->vdev_islog = 0;
	}

	/*
	 * Set vdev_top to tvd on vd and on every vdev beneath it. A NULL vd
	 * is ignored.
	 */
	static void
	vdev_top_update(vdev_t *tvd, vdev_t *vd)
	{
		if (vd == NULL)
			return;

		vd->vdev_top = tvd;

		for (int c = 0; c < vd->vdev_children; c++)
			vdev_top_update(tvd, vd->vdev_child[c]);
	}

	/*
	 * Add a mirror/replacing vdev above an existing vdev.
	 *
	 * A new interior vdev with the given ops takes cvd's slot under cvd's
	 * current parent, and cvd becomes its child. If the new vdev ends up
	 * top-level, cvd's top-level state is transferred to it. Returns the
	 * new interior vdev. The caller must hold all config locks as writer.
	 */
	vdev_t *
	vdev_add_parent(vdev_t *cvd, vdev_ops_t *ops)
	{
		spa_t *spa = cvd->vdev_spa;
		vdev_t *pvd = cvd->vdev_parent;
		vdev_t *mvd;

		ASSERT(spa_config_held(spa, SCL_ALL, RW_WRITER) == SCL_ALL);

		/* guid 0: let vdev_alloc_common() generate one. */
		mvd = vdev_alloc_common(spa, cvd->vdev_id, 0, ops);

		mvd->vdev_asize = cvd->vdev_asize;
		mvd->vdev_min_asize = cvd->vdev_min_asize;
		mvd->vdev_max_asize = cvd->vdev_max_asize;
		mvd->vdev_ashift = cvd->vdev_ashift;
		mvd->vdev_state = cvd->vdev_state;
		mvd->vdev_crtxg = cvd->vdev_crtxg;

		/* Swap mvd into cvd's slot, then hang cvd beneath mvd. */
		vdev_remove_child(pvd, cvd);
		vdev_add_child(pvd, mvd);
		cvd->vdev_id = mvd->vdev_children;
		vdev_add_child(mvd, cvd);
		vdev_top_update(cvd->vdev_top, cvd->vdev_top);

		if (mvd == mvd->vdev_top)
			vdev_top_transfer(cvd, mvd);

		return (mvd);
	}

	/*
	 * Remove a 1-way mirror/replacing vdev from the tree.
	 *
	 * cvd's parent (mvd) must have exactly one child. cvd takes mvd's slot
	 * under mvd's parent and mvd is freed. The caller must hold all config
	 * locks as writer.
	 */
	void
	vdev_remove_parent(vdev_t *cvd)
	{
		vdev_t *mvd = cvd->vdev_parent;
		vdev_t *pvd = mvd->vdev_parent;

		ASSERT(spa_config_held(cvd->vdev_spa, SCL_ALL, RW_WRITER) == SCL_ALL);

		ASSERT(mvd->vdev_children == 1);
		ASSERT(mvd->vdev_ops == &vdev_mirror_ops ||
		    mvd->vdev_ops == &vdev_replacing_ops ||
		    mvd->vdev_ops == &vdev_spare_ops);
		cvd->vdev_ashift = mvd->vdev_ashift;

		vdev_remove_child(mvd, cvd);
		vdev_remove_child(pvd, mvd);

		/*
		 * If cvd will replace mvd as a top-level vdev, preserve mvd's guid.
		 * Otherwise, we could have detached an offline device, and when we
		 * go to import the pool we'll think we have two top-level vdevs,
		 * instead of a different version of the same top-level vdev.
		 */
		if (mvd->vdev_top == mvd) {
			uint64_t guid_delta = mvd->vdev_guid - cvd->vdev_guid;
			cvd->vdev_orig_guid = cvd->vdev_guid;
			cvd->vdev_guid += guid_delta;
			cvd->vdev_guid_sum += guid_delta;
		}
		cvd->vdev_id = mvd->vdev_id;
		vdev_add_child(pvd, cvd);
		vdev_top_update(cvd->vdev_top, cvd->vdev_top);

		if (cvd == cvd->vdev_top)
			vdev_top_transfer(mvd, cvd);

		ASSERT(mvd->vdev_children == 0);
		vdev_free(mvd);
	}

	/*
	 * Create metaslabs for vd so that it has vdev_asize >> vdev_ms_shift of
	 * them, keeping any that already exist.
	 *
	 * When txg is 0 each new metaslab's object number is read from the
	 * vdev's metaslab array in the MOS. Returns 0 on success, or the error
	 * from dmu_read() or metaslab_init(). A vdev whose vdev_ms_shift is 0
	 * is left untouched.
	 */
	int
	vdev_metaslab_init(vdev_t *vd, uint64_t txg)
	{
		spa_t *spa = vd->vdev_spa;
		objset_t *mos = spa->spa_meta_objset;
		uint64_t m;
		uint64_t oldc = vd->vdev_ms_count;
		uint64_t newc = vd->vdev_asize >> vd->vdev_ms_shift;
		metaslab_t **mspp;
		int error;

		ASSERT(txg == 0 || spa_config_held(spa, SCL_ALLOC, RW_WRITER));

		/*
		 * This vdev is not being allocated from yet or is a hole.
		 */
		if (vd->vdev_ms_shift == 0)
			return (0);

		ASSERT(!vd->vdev_ishole);

		/*
		 * Compute the raidz-deflation ratio. Note, we hard-code
		 * in 128k (1 << 17) because it is the "typical" blocksize.
		 * Even though SPA_MAXBLOCKSIZE changed, this algorithm can not change,
		 * otherwise it would inconsistently account for existing bp's.
		 */
		vd->vdev_deflate_ratio = (1 << 17) /
		    (vdev_psize_to_asize(vd, 1 << 17) >> SPA_MINBLOCKSHIFT);

		ASSERT(oldc <= newc);

		/* Grow the metaslab pointer array, carrying over existing entries. */
		mspp = kmem_zalloc(newc * sizeof (*mspp), KM_SLEEP);

		if (oldc != 0) {
			bcopy(vd->vdev_ms, mspp, oldc * sizeof (*mspp));
			kmem_free(vd->vdev_ms, oldc * sizeof (*mspp));
		}

		vd->vdev_ms = mspp;
		vd->vdev_ms_count = newc;

		for (m = oldc; m < newc; m++) {
			uint64_t object = 0;

			if (txg == 0) {
				error = dmu_read(mos, vd->vdev_ms_array,
				    m * sizeof (uint64_t), sizeof (uint64_t), &object,
				    DMU_READ_PREFETCH);
				if (error)
					return (error);
			}

			error = metaslab_init(vd->vdev_mg, m, object, txg,
			    &(vd->vdev_ms[m]));
			if (error)
				return (error);
		}

		/* With txg == 0 the caller may not hold SCL_ALLOC; take it here. */
		if (txg == 0)
			spa_config_enter(spa, SCL_ALLOC, FTAG, RW_WRITER);

		/*
		 * If the vdev is being removed we don't activate
		 * the metaslabs since we want to ensure that no new
		 * allocations are performed on this device.
		 */
		if (oldc == 0 && !vd->vdev_removing)
			metaslab_group_activate(vd->vdev_mg);

		if (txg == 0)
			spa_config_exit(spa, SCL_ALLOC, FTAG);

		return (0);
	}

	/*
	 * Tear down vd's metaslabs: passivate the metaslab group, finalize each
	 * metaslab that exists, and free the metaslab pointer array. Does
	 * nothing if the vdev has no metaslab array.
	 */
	void
	vdev_metaslab_fini(vdev_t *vd)
	{
		uint64_t nms = vd->vdev_ms_count;

		if (vd->vdev_ms == NULL)
			return;

		metaslab_group_passivate(vd->vdev_mg);

		for (uint64_t i = 0; i < nms; i++) {
			if (vd->vdev_ms[i] != NULL)
				metaslab_fini(vd->vdev_ms[i]);
		}

		kmem_free(vd->vdev_ms, nms * sizeof (metaslab_t *));
		vd->vdev_ms = NULL;
	}

	/*
	* State shared by the zios of one vdev probe. It is passed as the
	* private argument of the probe's null, read and write zios.
	*/
	typedef struct vdev_probe_stats {
	boolean_t vps_readable; /* set when a probe read completes without error */
	boolean_t vps_writeable; /* set when a probe write completes without error */
	int vps_flags; /* zio flags used for the probe's I/Os */
	} vdev_probe_stats_t;

	/*
	 * Completion callback shared by all zios of a probe; behavior depends
	 * on the zio type:
	 *   READ  - record readability; on success in a writeable pool, issue a
	 *           write of the same buffer to the same location, otherwise
	 *           free the buffer.
	 *   WRITE - record writeability and free the buffer.
	 *   NULL  - the probe's parent zio: fold the results into the vdev's
	 *           cant_read/cant_write flags, set the final error, clear
	 *           vdev_probe_zio, fail parents that cannot access the vdev,
	 *           and free the stats.
	 */
	static void
	vdev_probe_done(zio_t *zio)
	{
		spa_t *spa = zio->io_spa;
		vdev_t *vd = zio->io_vd;
		vdev_probe_stats_t *vps = zio->io_private;

		ASSERT(vd->vdev_probe_zio != NULL);

		if (zio->io_type == ZIO_TYPE_READ) {
			if (zio->io_error == 0)
				vps->vps_readable = 1;
			if (zio->io_error == 0 && spa_writeable(spa)) {
				/* The buffer is handed on to the write zio. */
				zio_nowait(zio_write_phys(vd->vdev_probe_zio, vd,
				    zio->io_offset, zio->io_size, zio->io_data,
				    ZIO_CHECKSUM_OFF, vdev_probe_done, vps,
				    ZIO_PRIORITY_SYNC_WRITE, vps->vps_flags, B_TRUE));
			} else {
				zio_buf_free(zio->io_data, zio->io_size);
			}
		} else if (zio->io_type == ZIO_TYPE_WRITE) {
			if (zio->io_error == 0)
				vps->vps_writeable = 1;
			zio_buf_free(zio->io_data, zio->io_size);
		} else if (zio->io_type == ZIO_TYPE_NULL) {
			zio_t *pio;

			/* Only ever set the flags here; they are never cleared. */
			vd->vdev_cant_read |= !vps->vps_readable;
			vd->vdev_cant_write |= !vps->vps_writeable;

			if (vdev_readable(vd) &&
			    (vdev_writeable(vd) || !spa_writeable(spa))) {
				zio->io_error = 0;
			} else {
				ASSERT(zio->io_error != 0);
				zfs_ereport_post(FM_EREPORT_ZFS_PROBE_FAILURE,
				    spa, vd, NULL, 0, 0);
				zio->io_error = SET_ERROR(ENXIO);
			}

			mutex_enter(&vd->vdev_probe_lock);
			ASSERT(vd->vdev_probe_zio == zio);
			vd->vdev_probe_zio = NULL;
			mutex_exit(&vd->vdev_probe_lock);

			while ((pio = zio_walk_parents(zio)) != NULL)
				if (!vdev_accessible(vd, pio))
					pio->io_error = SET_ERROR(ENXIO);

			kmem_free(vps, sizeof (*vps));
		}
	}

	/*
	 * Determine whether this device is accessible.
	 *
	 * Read and write to several known locations: the pad regions of each
	 * vdev label but the first, which we leave alone in case it contains
	 * a VTOC.
	 *
	 * vd must be a leaf. If zio is non-NULL it becomes a parent of the
	 * probe and NULL is returned; the probe is issued here if this call
	 * created it. If zio is NULL the probe zio is returned un-issued.
	 * NULL is also returned when zio is itself a probe I/O.
	 */
	zio_t *
	vdev_probe(vdev_t *vd, zio_t *zio)
	{
		spa_t *spa = vd->vdev_spa;
		vdev_probe_stats_t *vps = NULL;
		zio_t *pio;

		ASSERT(vd->vdev_ops->vdev_op_leaf);

		/*
		 * Don't probe the probe.
		 */
		if (zio && (zio->io_flags & ZIO_FLAG_PROBE))
			return (NULL);

		/*
		 * To prevent 'probe storms' when a device fails, we create
		 * just one probe i/o at a time. All zios that want to probe
		 * this vdev will become parents of the probe io.
		 */
		mutex_enter(&vd->vdev_probe_lock);

		if ((pio = vd->vdev_probe_zio) == NULL) {
			vps = kmem_zalloc(sizeof (*vps), KM_SLEEP);

			vps->vps_flags = ZIO_FLAG_CANFAIL | ZIO_FLAG_PROBE |
			    ZIO_FLAG_DONT_CACHE | ZIO_FLAG_DONT_AGGREGATE |
			    ZIO_FLAG_TRYHARD;

			if (spa_config_held(spa, SCL_ZIO, RW_WRITER)) {
				/*
				 * vdev_cant_read and vdev_cant_write can only
				 * transition from TRUE to FALSE when we have the
				 * SCL_ZIO lock as writer; otherwise they can only
				 * transition from FALSE to TRUE. This ensures that
				 * any zio looking at these values can assume that
				 * failures persist for the life of the I/O. That's
				 * important because when a device has intermittent
				 * connectivity problems, we want to ensure that
				 * they're ascribed to the device (ENXIO) and not
				 * the zio (EIO).
				 *
				 * Since we hold SCL_ZIO as writer here, clear both
				 * values so the probe can reevaluate from first
				 * principles.
				 */
				vps->vps_flags |= ZIO_FLAG_CONFIG_WRITER;
				vd->vdev_cant_read = B_FALSE;
				vd->vdev_cant_write = B_FALSE;
			}

			vd->vdev_probe_zio = pio = zio_null(NULL, spa, vd,
			    vdev_probe_done, vps,
			    vps->vps_flags | ZIO_FLAG_DONT_PROPAGATE);

			/*
			 * We can't change the vdev state in this context, so we
			 * kick off an async task to do it on our behalf.
			 */
			if (zio != NULL) {
				vd->vdev_probe_wanted = B_TRUE;
				spa_async_request(spa, SPA_ASYNC_PROBE);
			}
		}

		if (zio != NULL)
			zio_add_child(zio, pio);

		mutex_exit(&vd->vdev_probe_lock);

		/* A probe was already in flight; we only joined it. */
		if (vps == NULL) {
			ASSERT(zio != NULL);
			return (NULL);
		}

		/* Start at label 1: the first label is left alone. */
		for (int l = 1; l < VDEV_LABELS; l++) {
			zio_nowait(zio_read_phys(pio, vd,
			    vdev_label_offset(vd->vdev_psize, l,
			    offsetof(vdev_label_t, vl_pad2)),
			    VDEV_PAD_SIZE, zio_buf_alloc(VDEV_PAD_SIZE),
			    ZIO_CHECKSUM_OFF, vdev_probe_done, vps,
			    ZIO_PRIORITY_SYNC_READ, vps->vps_flags, B_TRUE));
		}

		if (zio == NULL)
			return (pio);

		zio_nowait(pio);
		return (NULL);
	}

	/*
	 * Taskq callback: open one child vdev and record the result in its
	 * vdev_open_error. vdev_open_thread is set to the current thread for
	 * the duration of the open.
	 */
	static void
	vdev_open_child(void *arg)
	{
		vdev_t *child = arg;

		child->vdev_open_thread = curthread;
		child->vdev_open_error = vdev_open(child);
		child->vdev_open_thread = NULL;
	}

	/*
	 * Return B_TRUE if vd, or any vdev beneath it, has a path that begins
	 * with ZVOL_DIR.
	 */
	boolean_t
	vdev_uses_zvols(vdev_t *vd)
	{
		if (vd->vdev_path != NULL &&
		    strncmp(vd->vdev_path, ZVOL_DIR, strlen(ZVOL_DIR)) == 0) {
			return (B_TRUE);
		}

		for (int i = 0; i < vd->vdev_children; i++) {
			if (vdev_uses_zvols(vd->vdev_child[i]))
				return (B_TRUE);
		}

		return (B_FALSE);
	}

	/*
	 * Open every child of vd, storing each result in that child's
	 * vdev_open_error. The children are opened in parallel on a temporary
	 * taskq, unless the subtree uses zvols, in which case they are opened
	 * one after another in the calling thread.
	 */
	void
	vdev_open_children(vdev_t *vd)
	{
		int nchildren = vd->vdev_children;
		taskq_t *open_tq;

		/*
		 * in order to handle pools on top of zvols, do the opens
		 * in a single thread so that the same thread holds the
		 * spa_namespace_lock
		 */
		if (vdev_uses_zvols(vd)) {
			for (int i = 0; i < nchildren; i++) {
				vd->vdev_child[i]->vdev_open_error =
				    vdev_open(vd->vdev_child[i]);
			}
			return;
		}

		open_tq = taskq_create("vdev_open", nchildren, minclsyspri,
		    nchildren, nchildren, TASKQ_PREPOPULATE);

		for (int i = 0; i < nchildren; i++) {
			VERIFY(taskq_dispatch(open_tq, vdev_open_child,
			    vd->vdev_child[i], TQ_SLEEP) != NULL);
		}

		/* Destroying the taskq waits for the dispatched opens to finish. */
		taskq_destroy(open_tq);
	}

	/*
	 * Prepare a virtual device for access.
	 *
	 * Returns 0 on success.  On failure the vdev state is updated through
	 * vdev_set_state() and one of the following is returned:
	 *   ENXIO      the vdev is marked faulted or offline;
	 *   (other)    the error from the vdev_op_open callback or from device
	 *              fault injection;
	 *   EOVERFLOW  the reported size is below the minimum device size;
	 *   EINVAL     the allocatable size is below vdev_min_asize;
	 *   (other)    the error from the probe I/O issued on a leaf vdev.
	 */
	int
	vdev_open(vdev_t *vd)
	{
		spa_t *spa = vd->vdev_spa;
		int error;
		uint64_t osize = 0;
		uint64_t max_osize = 0;
		uint64_t asize, max_asize, psize;
		uint64_t ashift = 0;

		ASSERT(vd->vdev_open_thread == curthread ||
		    spa_config_held(spa, SCL_STATE_ALL, RW_WRITER) == SCL_STATE_ALL);
		ASSERT(vd->vdev_state == VDEV_STATE_CLOSED ||
		    vd->vdev_state == VDEV_STATE_CANT_OPEN ||
		    vd->vdev_state == VDEV_STATE_OFFLINE);

		vd->vdev_stat.vs_aux = VDEV_AUX_NONE;
		vd->vdev_cant_read = B_FALSE;
		vd->vdev_cant_write = B_FALSE;
		vd->vdev_min_asize = vdev_get_min_asize(vd);

		/*
		 * If this vdev is not removed, check its fault status.  If it's
		 * faulted, bail out of the open.
		 */
		if (!vd->vdev_removed && vd->vdev_faulted) {
			ASSERT(vd->vdev_children == 0);
			ASSERT(vd->vdev_label_aux == VDEV_AUX_ERR_EXCEEDED ||
			    vd->vdev_label_aux == VDEV_AUX_EXTERNAL);
			vdev_set_state(vd, B_TRUE, VDEV_STATE_FAULTED,
			    vd->vdev_label_aux);
			return (SET_ERROR(ENXIO));
		} else if (vd->vdev_offline) {
			ASSERT(vd->vdev_children == 0);
			vdev_set_state(vd, B_TRUE, VDEV_STATE_OFFLINE, VDEV_AUX_NONE);
			return (SET_ERROR(ENXIO));
		}

		error = vd->vdev_ops->vdev_op_open(vd, &osize, &max_osize, &ashift);

		/*
		 * Reset the vdev_reopening flag so that we actually close
		 * the vdev on error.
		 */
		vd->vdev_reopening = B_FALSE;
		if (zio_injection_enabled && error == 0)
			error = zio_handle_device_injection(vd, NULL, ENXIO);

		if (error) {
			if (vd->vdev_removed &&
			    vd->vdev_stat.vs_aux != VDEV_AUX_OPEN_FAILED)
				vd->vdev_removed = B_FALSE;

			vdev_set_state(vd, B_TRUE, VDEV_STATE_CANT_OPEN,
			    vd->vdev_stat.vs_aux);
			return (error);
		}

		vd->vdev_removed = B_FALSE;

		/*
		 * Recheck the faulted flag now that we have confirmed that
		 * the vdev is accessible.  If we're faulted, bail.
		 */
		if (vd->vdev_faulted) {
			ASSERT(vd->vdev_children == 0);
			ASSERT(vd->vdev_label_aux == VDEV_AUX_ERR_EXCEEDED ||
			    vd->vdev_label_aux == VDEV_AUX_EXTERNAL);
			vdev_set_state(vd, B_TRUE, VDEV_STATE_FAULTED,
			    vd->vdev_label_aux);
			return (SET_ERROR(ENXIO));
		}

		if (vd->vdev_degraded) {
			ASSERT(vd->vdev_children == 0);
			vdev_set_state(vd, B_TRUE, VDEV_STATE_DEGRADED,
			    VDEV_AUX_ERR_EXCEEDED);
		} else {
			vdev_set_state(vd, B_TRUE, VDEV_STATE_HEALTHY, 0);
		}

		/*
		 * For hole or missing vdevs we just return success.
		 */
		if (vd->vdev_ishole || vd->vdev_ops == &vdev_missing_ops)
			return (0);

		/* Any unhealthy child makes this vdev degraded. */
		for (int c = 0; c < vd->vdev_children; c++) {
			if (vd->vdev_child[c]->vdev_state != VDEV_STATE_HEALTHY) {
				vdev_set_state(vd, B_TRUE, VDEV_STATE_DEGRADED,
				    VDEV_AUX_NONE);
				break;
			}
		}

		osize = P2ALIGN(osize, (uint64_t)sizeof (vdev_label_t));
		max_osize = P2ALIGN(max_osize, (uint64_t)sizeof (vdev_label_t));

		if (vd->vdev_children == 0) {
			if (osize < SPA_MINDEVSIZE) {
				vdev_set_state(vd, B_TRUE, VDEV_STATE_CANT_OPEN,
				    VDEV_AUX_TOO_SMALL);
				return (SET_ERROR(EOVERFLOW));
			}
			/* Leaf: the label areas are not allocatable. */
			psize = osize;
			asize = osize - (VDEV_LABEL_START_SIZE + VDEV_LABEL_END_SIZE);
			max_asize = max_osize - (VDEV_LABEL_START_SIZE +
			    VDEV_LABEL_END_SIZE);
		} else {
			if (vd->vdev_parent != NULL && osize < SPA_MINDEVSIZE -
			    (VDEV_LABEL_START_SIZE + VDEV_LABEL_END_SIZE)) {
				vdev_set_state(vd, B_TRUE, VDEV_STATE_CANT_OPEN,
				    VDEV_AUX_TOO_SMALL);
				return (SET_ERROR(EOVERFLOW));
			}
			psize = 0;
			asize = osize;
			max_asize = max_osize;
		}

		vd->vdev_psize = psize;

		/*
		 * Make sure the allocatable size hasn't shrunk.
		 */
		if (asize < vd->vdev_min_asize) {
			vdev_set_state(vd, B_TRUE, VDEV_STATE_CANT_OPEN,
			    VDEV_AUX_BAD_LABEL);
			return (SET_ERROR(EINVAL));
		}

		if (vd->vdev_asize == 0) {
			/*
			 * This is the first-ever open, so use the computed values.
			 * For testing purposes, a higher ashift can be requested.
			 */
			vd->vdev_asize = asize;
			vd->vdev_max_asize = max_asize;
			vd->vdev_ashift = MAX(ashift, vd->vdev_ashift);
		} else {
			/*
			 * Detect if the alignment requirement has increased.
			 * We don't want to make the pool unavailable, just
			 * issue a warning instead.
			 */
			if (ashift > vd->vdev_top->vdev_ashift &&
			    vd->vdev_ops->vdev_op_leaf) {
				cmn_err(CE_WARN,
				    "Disk, '%s', has a block alignment that is "
				    "larger than the pool's alignment\n",
				    vd->vdev_path);
			}
			vd->vdev_max_asize = max_asize;
		}

		/*
		 * If all children are healthy and the asize has increased,
		 * then we've experienced dynamic LUN growth.  If automatic
		 * expansion is enabled then use the additional space.
		 */
		if (vd->vdev_state == VDEV_STATE_HEALTHY && asize > vd->vdev_asize &&
		    (vd->vdev_expanding || spa->spa_autoexpand))
			vd->vdev_asize = asize;

		vdev_set_min_asize(vd);

		/*
		 * Ensure we can issue some IO before declaring the
		 * vdev open for business.
		 */
		if (vd->vdev_ops->vdev_op_leaf &&
		    (error = zio_wait(vdev_probe(vd, NULL))) != 0) {
			vdev_set_state(vd, B_TRUE, VDEV_STATE_FAULTED,
			    VDEV_AUX_ERR_EXCEEDED);
			return (error);
		}

		/*
		 * Track the min and max ashift values for normal data devices.
		 */
		if (vd->vdev_top == vd && vd->vdev_ashift != 0 &&
		    !vd->vdev_islog && vd->vdev_aux == NULL) {
			if (vd->vdev_ashift > spa->spa_max_ashift)
				spa->spa_max_ashift = vd->vdev_ashift;
			if (vd->vdev_ashift < spa->spa_min_ashift)
				spa->spa_min_ashift = vd->vdev_ashift;
		}

		/*
		 * If a leaf vdev has a DTL, and seems healthy, then kick off a
		 * resilver.  But don't do this if we are doing a reopen for a scrub,
		 * since this would just restart the scrub we are already doing.
		 */
		if (vd->vdev_ops->vdev_op_leaf && !spa->spa_scrub_reopen &&
		    vdev_resilver_needed(vd, NULL, NULL))
			spa_async_request(spa, SPA_ASYNC_RESILVER);

		return (0);
	}

	/*
	* Called once the vdevs are all opened, this routine validates the label
	* contents. This needs to be done before vdev_load() so that we don't
	* inadvertently do repair I/Os to the wrong device.
	*
	* If 'strict' is false ignore the spa guid check. This is necessary because
	* if the machine crashed during a re-guid the new guid might have been written
	* to all of the vdev labels, but not the cached config. The strict check
	* will be performed when the pool is opened again using the mos config.
	*
	* This function will only return failure if one of the vdevs indicates that it
	* has since been destroyed or exported. This is only possible if
	* /etc/zfs/zpool.cache was readonly at the time. Otherwise, the vdev state
	* will be updated but the function will return 0.
	*/
	int
	vdev_validate(vdev_t *vd, boolean_t strict)
	{
		spa_t *spa = vd->vdev_spa;
		nvlist_t *label;
		uint64_t guid = 0, top_guid;
		uint64_t state;

		/* Validate the subtree first; any child failure fails the whole. */
		for (int c = 0; c < vd->vdev_children; c++)
			if (vdev_validate(vd->vdev_child[c], strict) != 0)
				return (SET_ERROR(EBADF));

		/*
		 * If the device has already failed, or was marked offline, don't do
		 * any further validation.  Otherwise, label I/O will fail and we will
		 * overwrite the previous state.
		 */
		if (vd->vdev_ops->vdev_op_leaf && vdev_readable(vd)) {
			uint64_t aux_guid = 0;
			nvlist_t *nvl;
			uint64_t txg = spa_last_synced_txg(spa) != 0 ?
			    spa_last_synced_txg(spa) : -1ULL;

			if ((label = vdev_label_read_config(vd, txg)) == NULL) {
				vdev_set_state(vd, B_TRUE, VDEV_STATE_CANT_OPEN,
				    VDEV_AUX_BAD_LABEL);
				return (0);
			}

			/*
			 * Determine if this vdev has been split off into another
			 * pool.  If so, then refuse to open it.
			 */
			if (nvlist_lookup_uint64(label, ZPOOL_CONFIG_SPLIT_GUID,
			    &aux_guid) == 0 && aux_guid == spa_guid(spa)) {
				vdev_set_state(vd, B_FALSE, VDEV_STATE_CANT_OPEN,
				    VDEV_AUX_SPLIT_POOL);
				nvlist_free(label);
				return (0);
			}

			/* The pool guid is only checked in strict mode. */
			if (strict && (nvlist_lookup_uint64(label,
			    ZPOOL_CONFIG_POOL_GUID, &guid) != 0 ||
			    guid != spa_guid(spa))) {
				vdev_set_state(vd, B_FALSE, VDEV_STATE_CANT_OPEN,
				    VDEV_AUX_CORRUPT_DATA);
				nvlist_free(label);
				return (0);
			}

			/* aux_guid becomes the original guid if present, else 0. */
			if (nvlist_lookup_nvlist(label, ZPOOL_CONFIG_VDEV_TREE, &nvl)
			    != 0 || nvlist_lookup_uint64(nvl, ZPOOL_CONFIG_ORIG_GUID,
			    &aux_guid) != 0)
				aux_guid = 0;

			/*
			 * If this vdev just became a top-level vdev because its
			 * sibling was detached, it will have adopted the parent's
			 * vdev guid -- but the label may or may not be on disk yet.
			 * Fortunately, either version of the label will have the
			 * same top guid, so if we're a top-level vdev, we can
			 * safely compare to that instead.
			 *
			 * If we split this vdev off instead, then we also check the
			 * original pool's guid.  We don't want to consider the vdev
			 * corrupt if it is partway through a split operation.
			 */
			if (nvlist_lookup_uint64(label, ZPOOL_CONFIG_GUID,
			    &guid) != 0 ||
			    nvlist_lookup_uint64(label, ZPOOL_CONFIG_TOP_GUID,
			    &top_guid) != 0 ||
			    ((vd->vdev_guid != guid && vd->vdev_guid != aux_guid) &&
			    (vd->vdev_guid != top_guid || vd != vd->vdev_top))) {
				vdev_set_state(vd, B_FALSE, VDEV_STATE_CANT_OPEN,
				    VDEV_AUX_CORRUPT_DATA);
				nvlist_free(label);
				return (0);
			}

			if (nvlist_lookup_uint64(label, ZPOOL_CONFIG_POOL_STATE,
			    &state) != 0) {
				vdev_set_state(vd, B_FALSE, VDEV_STATE_CANT_OPEN,
				    VDEV_AUX_CORRUPT_DATA);
				nvlist_free(label);
				return (0);
			}

			nvlist_free(label);

			/*
			 * If this is a verbatim import, no need to check the
			 * state of the pool.
			 */
			if (!(spa->spa_import_flags & ZFS_IMPORT_VERBATIM) &&
			    spa_load_state(spa) == SPA_LOAD_OPEN &&
			    state != POOL_STATE_ACTIVE)
				return (SET_ERROR(EBADF));

			/*
			 * If we were able to open and validate a vdev that was
			 * previously marked permanently unavailable, clear that state
			 * now.
			 */
			if (vd->vdev_not_present)
				vd->vdev_not_present = 0;
		}

		return (0);
	}

	/*
	 * Close a virtual device.
	 *
	 * Calls the vdev's vdev_op_close callback, purges its vdev cache, saves
	 * the current state in vdev_prevstate and moves the vdev to OFFLINE (if
	 * vdev_offline is set) or CLOSED.  The caller must hold SCL_STATE_ALL as
	 * writer.
	 */
	void
	vdev_close(vdev_t *vd)
	{
		vdev_t *parent = vd->vdev_parent;
		spa_t *spa = vd->vdev_spa;

		ASSERT(spa_config_held(spa, SCL_STATE_ALL, RW_WRITER) == SCL_STATE_ALL);

		/*
		 * If our parent is reopening, then we are as well, unless we are
		 * going offline.
		 */
		if (parent != NULL && parent->vdev_reopening) {
			vd->vdev_reopening =
			    (parent->vdev_reopening && !vd->vdev_offline);
		}

		vd->vdev_ops->vdev_op_close(vd);
		vdev_cache_purge(vd);

		/*
		 * We record the previous state before we close it, so that if we are
		 * doing a reopen(), we don't generate FMA ereports if we notice that
		 * it's still faulted.
		 */
		vd->vdev_prevstate = vd->vdev_state;

		vd->vdev_state = vd->vdev_offline ?
		    VDEV_STATE_OFFLINE : VDEV_STATE_CLOSED;
		vd->vdev_stat.vs_aux = VDEV_AUX_NONE;
	}

	/*
	 * Recursively invoke vdev_op_hold on every leaf vdev at or below 'vd'.
	 * Does nothing when the pool state is POOL_STATE_UNINITIALIZED.
	 */
	void
	vdev_hold(vdev_t *vd)
	{
		spa_t *spa = vd->vdev_spa;
		int c;

		ASSERT(spa_is_root(spa));

		if (spa->spa_state != POOL_STATE_UNINITIALIZED) {
			for (c = 0; c < vd->vdev_children; c++)
				vdev_hold(vd->vdev_child[c]);

			if (vd->vdev_ops->vdev_op_leaf)
				vd->vdev_ops->vdev_op_hold(vd);
		}
	}

	/*
	 * Recursively invoke vdev_op_rele on every leaf vdev at or below 'vd'.
	 */
	void
	vdev_rele(vdev_t *vd)
	{
		spa_t *spa = vd->vdev_spa;
		int c = 0;

		ASSERT(spa_is_root(spa));

		/* Children first, then this vdev if it is a leaf. */
		while (c < vd->vdev_children) {
			vdev_rele(vd->vdev_child[c]);
			c++;
		}

		if (vd->vdev_ops->vdev_op_leaf) {
			vd->vdev_ops->vdev_op_rele(vd);
		}
	}

	/*
	* Reopen all interior vdevs and any unopened leaves. We don't actually
	* reopen leaf vdevs which had previously been opened as they might deadlock
	* on the spa_config_lock. Instead we only obtain the leaf's physical size.
	* If the leaf has never been opened then open it, as usual.
	*/
	void
	vdev_reopen(vdev_t *vd)
	{
		spa_t *spa = vd->vdev_spa;

		ASSERT(spa_config_held(spa, SCL_STATE_ALL, RW_WRITER) == SCL_STATE_ALL);

		/* set the reopening flag unless we're taking the vdev offline */
		vd->vdev_reopening = !vd->vdev_offline;
		vdev_close(vd);
		(void) vdev_open(vd);

		/*
		 * Call vdev_validate() here to make sure we have the same device.
		 * Otherwise, a device with an invalid label could be successfully
		 * opened in response to vdev_reopen().
		 */
		if (vd->vdev_aux == NULL) {
			(void) vdev_validate(vd, B_TRUE);
		} else {
			(void) vdev_validate_aux(vd);

			/* Add a usable l2cache device that is not yet present. */
			if (vdev_readable(vd) && vdev_writeable(vd) &&
			    vd->vdev_aux == &spa->spa_l2cache &&
			    !l2arc_vdev_present(vd)) {
				l2arc_add_vdev(spa, vd);
			}
		}

		/*
		 * Reassess parent vdev's health.
		 */
		vdev_propagate_state(vd);
	}

	/*
	 * Open a newly created vdev tree, load its DTLs and initialize its labels.
	 *
	 * Returns 0 on success.  If the open fails or leaves the vdev in any state
	 * other than HEALTHY, the vdev is closed and the open error (or ENXIO when
	 * the open itself reported no error) is returned.  If vdev_dtl_load() or
	 * vdev_label_init() fails, the vdev is closed and that error is returned.
	 * 'isreplacing' selects VDEV_LABEL_REPLACE instead of VDEV_LABEL_CREATE.
	 */
	int
	vdev_create(vdev_t *vd, uint64_t txg, boolean_t isreplacing)
	{
		int error;

		/*
		 * Normally, partial opens (e.g. of a mirror) are allowed.
		 * For a create, however, we want to fail the request if
		 * there are any components we can't open.
		 */
		error = vdev_open(vd);

		if (error || vd->vdev_state != VDEV_STATE_HEALTHY) {
			vdev_close(vd);
			return (error ? error : ENXIO);
		}

		/*
		 * Recursively load DTLs and initialize all labels.
		 */
		if ((error = vdev_dtl_load(vd)) != 0 ||
		    (error = vdev_label_init(vd, txg, isreplacing ?
		    VDEV_LABEL_REPLACE : VDEV_LABEL_CREATE)) != 0) {
			vdev_close(vd);
			return (error);
		}

		return (0);
	}

	/*
	 * Compute vdev_ms_shift from the vdev's asize, with SPA_MAXBLOCKSHIFT
	 * as the lower bound.
	 */
	void
	vdev_metaslab_set_size(vdev_t *vd)
	{
		/*
		 * Aim for roughly metaslabs_per_vdev (default 200) metaslabs per vdev.
		 */
		vd->vdev_ms_shift = highbit64(vd->vdev_asize / metaslabs_per_vdev);

		/* Never go below the maximum block shift. */
		if (vd->vdev_ms_shift < SPA_MAXBLOCKSHIFT)
			vd->vdev_ms_shift = SPA_MAXBLOCKSHIFT;
	}

	/*
	 * Mark a top-level vdev dirty in the given txg.
	 *
	 * 'flags' must be a single bit (asserted via ISP2).  With VDD_METASLAB,
	 * 'arg' is added to the vdev's metaslab txg list; with VDD_DTL, 'arg' is
	 * added to its DTL txg list.  In every case the vdev itself is added to
	 * the spa's vdev txg list.
	 */
	void
	vdev_dirty(vdev_t *vd, int flags, void *arg, uint64_t txg)
	{
		ASSERT(vd == vd->vdev_top);
		ASSERT(!vd->vdev_ishole);
		ASSERT(ISP2(flags));
		ASSERT(spa_writeable(vd->vdev_spa));

		if (flags & VDD_METASLAB)
			(void) txg_list_add(&vd->vdev_ms_list, arg, txg);

		if (flags & VDD_DTL)
			(void) txg_list_add(&vd->vdev_dtl_list, arg, txg);

		(void) txg_list_add(&vd->vdev_spa->spa_vdev_txg_list, vd, txg);
	}

	/*
	 * Call vdev_dirty() on the top-level vdev once for every leaf at or below
	 * 'vd', passing the leaf as the argument.
	 */
	void
	vdev_dirty_leaves(vdev_t *vd, int flags, uint64_t txg)
	{
		int c = 0;

		while (c < vd->vdev_children) {
			vdev_dirty_leaves(vd->vdev_child[c], flags, txg);
			c++;
		}

		if (vd->vdev_ops->vdev_op_leaf) {
			vdev_dirty(vd->vdev_top, flags, vd, txg);
		}
	}

	/*
	* DTLs.
	*
	* A vdev's DTL (dirty time log) is the set of transaction groups for which
	* the vdev has less than perfect replication. There are four kinds of DTL:
	*
	* DTL_MISSING: txgs for which the vdev has no valid copies of the data
	*
	* DTL_PARTIAL: txgs for which data is available, but not fully replicated
	*
	* DTL_SCRUB: the txgs that could not be repaired by the last scrub; upon
	* scrub completion, DTL_SCRUB replaces DTL_MISSING in the range of
	* txgs that was scrubbed.
	*
	* DTL_OUTAGE: txgs which cannot currently be read, whether due to
	* persistent errors or just some device being offline.
	* Unlike the other three, the DTL_OUTAGE map is not generally
	* maintained; it's only computed when needed, typically to
	* determine whether a device can be detached.
	*
	* For leaf vdevs, DTL_MISSING and DTL_PARTIAL are identical: the device
	* either has the data or it doesn't.
	*
	* For interior vdevs such as mirror and RAID-Z the picture is more complex.
	* A vdev's DTL_PARTIAL is the union of its children's DTL_PARTIALs, because
	* if any child is less than fully replicated, then so is its parent.
	* A vdev's DTL_MISSING is a modified union of its children's DTL_MISSINGs,
	* comprising only those txgs which appear in more than 'maxfaults' children;
	* those are the txgs we don't have enough replication to read. For example,
	* double-parity RAID-Z can tolerate up to two missing devices (maxfaults == 2);
	* thus, its DTL_MISSING consists of the set of txgs that appear in more than
	* two child DTL_MISSING maps.
	*
	* It should be clear from the above that to compute the DTLs and outage maps
	* for all vdevs, it suffices to know just the leaf vdevs' DTL_MISSING maps.
	* Therefore, that is all we keep on disk. When loading the pool, or after
	* a configuration change, we generate all other DTLs from first principles.
	*/
	/*
	 * Add the range (txg, size) to the vdev's DTL of type 't', unless the
	 * tree already contains it.  The tree's own rt_lock is held throughout.
	 */
	void
	vdev_dtl_dirty(vdev_t *vd, vdev_dtl_type_t t, uint64_t txg, uint64_t size)
	{
		range_tree_t *tree = vd->vdev_dtl[t];
		boolean_t present;

		ASSERT(t < DTL_TYPES);
		ASSERT(vd != vd->vdev_spa->spa_root_vdev);
		ASSERT(spa_writeable(vd->vdev_spa));

		mutex_enter(tree->rt_lock);
		present = range_tree_contains(tree, txg, size);
		if (!present) {
			range_tree_add(tree, txg, size);
		}
		mutex_exit(tree->rt_lock);
	}

	/*
	 * Report whether the vdev's DTL of type 't' contains the range
	 * (txg, size).  An empty tree yields B_FALSE without a lookup.
	 */
	boolean_t
	vdev_dtl_contains(vdev_t *vd, vdev_dtl_type_t t, uint64_t txg, uint64_t size)
	{
		range_tree_t *tree = vd->vdev_dtl[t];
		boolean_t found;

		ASSERT(t < DTL_TYPES);
		ASSERT(vd != vd->vdev_spa->spa_root_vdev);

		mutex_enter(tree->rt_lock);
		found = (range_tree_space(tree) != 0) ?
		    range_tree_contains(tree, txg, size) : B_FALSE;
		mutex_exit(tree->rt_lock);

		return (found);
	}

	/*
	 * Report whether the vdev's DTL of type 't' holds no space at all.
	 */
	boolean_t
	vdev_dtl_empty(vdev_t *vd, vdev_dtl_type_t t)
	{
		range_tree_t *tree = vd->vdev_dtl[t];
		boolean_t is_empty;

		mutex_enter(tree->rt_lock);
		is_empty = (range_tree_space(tree) == 0) ? B_TRUE : B_FALSE;
		mutex_exit(tree->rt_lock);

		return (is_empty);
	}

	/*
	 * Returns the lowest txg in the DTL range, computed as one less than the
	 * start of the first segment in the leaf's DTL_MISSING tree.  The caller
	 * must hold vdev_dtl_lock and the tree must be non-empty.
	 */
	static uint64_t
	vdev_dtl_min(vdev_t *vd)
	{
		range_seg_t *first;

		ASSERT(MUTEX_HELD(&vd->vdev_dtl_lock));
		ASSERT3U(range_tree_space(vd->vdev_dtl[DTL_MISSING]), !=, 0);
		ASSERT0(vd->vdev_children);

		first = avl_first(&vd->vdev_dtl[DTL_MISSING]->rt_root);

		return (first->rs_start - 1);
	}

	/*
	 * Returns the highest txg in the DTL, taken as the end of the last
	 * segment in the leaf's DTL_MISSING tree.  The caller must hold
	 * vdev_dtl_lock and the tree must be non-empty.
	 */
	static uint64_t
	vdev_dtl_max(vdev_t *vd)
	{
		range_seg_t *last;

		ASSERT(MUTEX_HELD(&vd->vdev_dtl_lock));
		ASSERT3U(range_tree_space(vd->vdev_dtl[DTL_MISSING]), !=, 0);
		ASSERT0(vd->vdev_children);

		last = avl_last(&vd->vdev_dtl[DTL_MISSING]->rt_root);

		return (last->rs_end);
	}

	/*
	* Determine if a resilvering vdev should remove any DTL entries from
	* its range. If the vdev was resilvering for the entire duration of the
	* scan then it should excise that range from its DTLs. Otherwise, this
	* vdev is considered partially resilvered and should leave its DTL
	* entries intact. The comment in vdev_dtl_reassess() describes how we
	* excise the DTLs.
	*/
	static boolean_t
	vdev_dtl_should_excise(vdev_t *vd)
	{
		spa_t *spa = vd->vdev_spa;
		dsl_scan_t *scn = spa->spa_dsl_pool->dp_scan;

		ASSERT0(scn->scn_phys.scn_errors);
		ASSERT0(vd->vdev_children);

		/* Not resilvering, or nothing missing: always eligible. */
		if (vd->vdev_resilver_txg == 0 ||
		    range_tree_space(vd->vdev_dtl[DTL_MISSING]) == 0)
			return (B_TRUE);

		/*
		 * When a resilver is initiated the scan will assign the scn_max_txg
		 * value to the highest txg value that exists in all DTLs.  If this
		 * device's max DTL is not part of this scan (i.e. it is not in
		 * the range (scn_min_txg, scn_max_txg] then it is not eligible
		 * for excision.
		 */
		if (vdev_dtl_max(vd) <= scn->scn_phys.scn_max_txg) {
			ASSERT3U(scn->scn_phys.scn_min_txg, <=, vdev_dtl_min(vd));
			ASSERT3U(scn->scn_phys.scn_min_txg, <, vd->vdev_resilver_txg);
			ASSERT3U(vd->vdev_resilver_txg, <=, scn->scn_phys.scn_max_txg);
			return (B_TRUE);
		}
		return (B_FALSE);
	}

	/*
	 * Reassess DTLs after a config change or scrub completion.
	 *
	 * The subtree is processed children first.  The root vdev, holes and aux
	 * vdevs are skipped.
	 *
	 *   txg         if non-zero, each leaf is dirtied (VDD_DTL) in this txg
	 *   scrub_txg   if non-zero and the scan qualifies, the leaf's
	 *               DTL_MISSING entries below this txg are excised
	 *   scrub_done  if set, each leaf's DTL_SCRUB map is vacated
	 */
	void
	vdev_dtl_reassess(vdev_t *vd, uint64_t txg, uint64_t scrub_txg, int scrub_done)
	{
		spa_t *spa = vd->vdev_spa;
		avl_tree_t reftree;
		int minref;

		ASSERT(spa_config_held(spa, SCL_ALL, RW_READER) != 0);

		for (int c = 0; c < vd->vdev_children; c++)
			vdev_dtl_reassess(vd->vdev_child[c], txg,
			    scrub_txg, scrub_done);

		if (vd == spa->spa_root_vdev || vd->vdev_ishole || vd->vdev_aux)
			return;

		if (vd->vdev_ops->vdev_op_leaf) {
			dsl_scan_t *scn = spa->spa_dsl_pool->dp_scan;

			mutex_enter(&vd->vdev_dtl_lock);

			/*
			 * If we've completed a scan cleanly then determine
			 * if this vdev should remove any DTLs.  We only want to
			 * excise regions on vdevs that were available during
			 * the entire duration of this scan.
			 */
			if (scrub_txg != 0 &&
			    (spa->spa_scrub_started ||
			    (scn != NULL && scn->scn_phys.scn_errors == 0)) &&
			    vdev_dtl_should_excise(vd)) {
				/*
				 * We completed a scrub up to scrub_txg.  If we
				 * did it without rebooting, then the scrub dtl
				 * will be valid, so excise the old region and
				 * fold in the scrub dtl.  Otherwise, leave the
				 * dtl as-is if there was an error.
				 *
				 * There's a little trick here: to excise the
				 * beginning of the DTL_MISSING map, we put it into
				 * a reference tree and then add a segment with
				 * refcnt -1 that covers the range [0, scrub_txg).
				 * This means that each txg in that range has
				 * refcnt -1 or 0.  We then add DTL_SCRUB with a
				 * refcnt of 2, so that entries in the range
				 * [0, scrub_txg) will have a positive refcnt --
				 * either 1 or 2.  We then convert the reference
				 * tree into the new DTL_MISSING map.
				 */
				space_reftree_create(&reftree);
				space_reftree_add_map(&reftree,
				    vd->vdev_dtl[DTL_MISSING], 1);
				space_reftree_add_seg(&reftree, 0, scrub_txg, -1);
				space_reftree_add_map(&reftree,
				    vd->vdev_dtl[DTL_SCRUB], 2);
				space_reftree_generate_map(&reftree,
				    vd->vdev_dtl[DTL_MISSING], 1);
				space_reftree_destroy(&reftree);
			}

			/* For a leaf, DTL_PARTIAL is a copy of DTL_MISSING. */
			range_tree_vacate(vd->vdev_dtl[DTL_PARTIAL], NULL, NULL);
			range_tree_walk(vd->vdev_dtl[DTL_MISSING],
			    range_tree_add, vd->vdev_dtl[DTL_PARTIAL]);
			if (scrub_done)
				range_tree_vacate(vd->vdev_dtl[DTL_SCRUB], NULL, NULL);

			/*
			 * DTL_OUTAGE covers everything if the leaf is unreadable,
			 * otherwise it is a copy of DTL_MISSING.
			 */
			range_tree_vacate(vd->vdev_dtl[DTL_OUTAGE], NULL, NULL);
			if (!vdev_readable(vd))
				range_tree_add(vd->vdev_dtl[DTL_OUTAGE], 0, -1ULL);
			else
				range_tree_walk(vd->vdev_dtl[DTL_MISSING],
				    range_tree_add, vd->vdev_dtl[DTL_OUTAGE]);

			/*
			 * If the vdev was resilvering and no longer has any
			 * DTLs then reset its resilvering flag.
			 */
			if (vd->vdev_resilver_txg != 0 &&
			    range_tree_space(vd->vdev_dtl[DTL_MISSING]) == 0 &&
			    range_tree_space(vd->vdev_dtl[DTL_OUTAGE]) == 0)
				vd->vdev_resilver_txg = 0;

			mutex_exit(&vd->vdev_dtl_lock);

			if (txg != 0)
				vdev_dirty(vd->vdev_top, VDD_DTL, vd, txg);
			return;
		}

		/* Interior vdev: rebuild each DTL from the children's maps. */
		mutex_enter(&vd->vdev_dtl_lock);
		for (int t = 0; t < DTL_TYPES; t++) {
			/* account for child's outage in parent's missing map */
			int s = (t == DTL_MISSING) ? DTL_OUTAGE: t;
			if (t == DTL_SCRUB)
				continue;			/* leaf vdevs only */
			if (t == DTL_PARTIAL)
				minref = 1;			/* i.e. non-zero */
			else if (vd->vdev_nparity != 0)
				minref = vd->vdev_nparity + 1;	/* RAID-Z */
			else
				minref = vd->vdev_children;	/* any kind of mirror */
			space_reftree_create(&reftree);
			for (int c = 0; c < vd->vdev_children; c++) {
				vdev_t *cvd = vd->vdev_child[c];
				mutex_enter(&cvd->vdev_dtl_lock);
				space_reftree_add_map(&reftree, cvd->vdev_dtl[s], 1);
				mutex_exit(&cvd->vdev_dtl_lock);
			}
			space_reftree_generate_map(&reftree, vd->vdev_dtl[t], minref);
			space_reftree_destroy(&reftree);
		}
		mutex_exit(&vd->vdev_dtl_lock);
	}

	/*
	 * Load the on-disk DTL for 'vd'.
	 *
	 * For a leaf with a DTL object, the space map is opened and loaded into
	 * the in-core DTL_MISSING tree.  Otherwise the children are loaded
	 * recursively, stopping at the first failure.  Returns 0 on success or
	 * the first error encountered.
	 */
	int
	vdev_dtl_load(vdev_t *vd)
	{
		spa_t *spa = vd->vdev_spa;
		objset_t *mos = spa->spa_meta_objset;
		int error = 0;

		if (!vd->vdev_ops->vdev_op_leaf || vd->vdev_dtl_object == 0) {
			/* Nothing to load here; descend into the children. */
			for (int c = 0; error == 0 && c < vd->vdev_children; c++)
				error = vdev_dtl_load(vd->vdev_child[c]);

			return (error);
		}

		ASSERT(!vd->vdev_ishole);

		error = space_map_open(&vd->vdev_dtl_sm, mos,
		    vd->vdev_dtl_object, 0, -1ULL, 0, &vd->vdev_dtl_lock);
		if (error != 0) {
			return (error);
		}
		ASSERT(vd->vdev_dtl_sm != NULL);

		mutex_enter(&vd->vdev_dtl_lock);

		/*
		 * Now that we've opened the space_map we need to update
		 * the in-core DTL.
		 */
		space_map_update(vd->vdev_dtl_sm);

		error = space_map_load(vd->vdev_dtl_sm,
		    vd->vdev_dtl[DTL_MISSING], SM_ALLOC);

		mutex_exit(&vd->vdev_dtl_lock);

		return (error);
	}

	/*
	 * Write a leaf vdev's DTL_MISSING map to its on-disk space map in 'txg'.
	 *
	 * If the vdev is detached or its top-level vdev is being removed, the
	 * space map is freed instead.  If no space map exists yet, one is
	 * allocated; when that changes the space map object, the top-level vdev's
	 * config is dirtied.
	 */
	void
	vdev_dtl_sync(vdev_t *vd, uint64_t txg)
	{
		spa_t *spa = vd->vdev_spa;
		range_tree_t *rt = vd->vdev_dtl[DTL_MISSING];
		objset_t *mos = spa->spa_meta_objset;
		range_tree_t *rtsync;
		kmutex_t rtlock;
		dmu_tx_t *tx;
		uint64_t object = space_map_object(vd->vdev_dtl_sm);

		ASSERT(!vd->vdev_ishole);
		ASSERT(vd->vdev_ops->vdev_op_leaf);

		tx = dmu_tx_create_assigned(spa->spa_dsl_pool, txg);

		if (vd->vdev_detached || vd->vdev_top->vdev_removing) {
			mutex_enter(&vd->vdev_dtl_lock);
			space_map_free(vd->vdev_dtl_sm, tx);
			space_map_close(vd->vdev_dtl_sm);
			vd->vdev_dtl_sm = NULL;
			mutex_exit(&vd->vdev_dtl_lock);
			dmu_tx_commit(tx);
			return;
		}

		if (vd->vdev_dtl_sm == NULL) {
			uint64_t new_object;

			new_object = space_map_alloc(mos, tx);
			VERIFY3U(new_object, !=, 0);

			VERIFY0(space_map_open(&vd->vdev_dtl_sm, mos, new_object,
			    0, -1ULL, 0, &vd->vdev_dtl_lock));
			ASSERT(vd->vdev_dtl_sm != NULL);
		}

		mutex_init(&rtlock, NULL, MUTEX_DEFAULT, NULL);

		rtsync = range_tree_create(NULL, NULL, &rtlock);

		mutex_enter(&rtlock);

		/* Copy the DTL into a private tree while holding the DTL lock. */
		mutex_enter(&vd->vdev_dtl_lock);
		range_tree_walk(rt, range_tree_add, rtsync);
		mutex_exit(&vd->vdev_dtl_lock);

		space_map_truncate(vd->vdev_dtl_sm, tx);
		space_map_write(vd->vdev_dtl_sm, rtsync, SM_ALLOC, tx);
		range_tree_vacate(rtsync, NULL, NULL);

		range_tree_destroy(rtsync);

		mutex_exit(&rtlock);
		mutex_destroy(&rtlock);

		/*
		 * If the object for the space map has changed then dirty
		 * the top level so that we update the config.
		 */
		if (object != space_map_object(vd->vdev_dtl_sm)) {
			/* %llu requires unsigned long long; cast the uint64_t values. */
			zfs_dbgmsg("txg %llu, spa %s, DTL old object %llu, "
			    "new object %llu", (unsigned long long)txg,
			    spa_name(spa), (unsigned long long)object,
			    (unsigned long long)space_map_object(vd->vdev_dtl_sm));
			vdev_config_dirty(vd->vdev_top);
		}

		dmu_tx_commit(tx);

		mutex_enter(&vd->vdev_dtl_lock);
		space_map_update(vd->vdev_dtl_sm);
		mutex_exit(&vd->vdev_dtl_lock);
	}

	/*
	 * Determine whether the specified vdev can be offlined/detached/removed
	 * without losing data.
	 *
	 * Returns B_TRUE if the vdev is required (it must stay), B_FALSE if it can
	 * safely go.  The root vdev and top-level vdevs are always required.  The
	 * caller must hold SCL_STATE_ALL as writer.
	 */
	boolean_t
	vdev_dtl_required(vdev_t *vd)
	{
		spa_t *spa = vd->vdev_spa;
		vdev_t *tvd = vd->vdev_top;
		uint8_t cant_read = vd->vdev_cant_read;
		boolean_t required;

		ASSERT(spa_config_held(spa, SCL_STATE_ALL, RW_WRITER) == SCL_STATE_ALL);

		if (vd == spa->spa_root_vdev || vd == tvd)
			return (B_TRUE);

		/*
		 * Temporarily mark the device as unreadable, and then determine
		 * whether this results in any DTL outages in the top-level vdev.
		 * If not, we can safely offline/detach/remove the device.
		 */
		vd->vdev_cant_read = B_TRUE;
		vdev_dtl_reassess(tvd, 0, 0, B_FALSE);
		required = !vdev_dtl_empty(tvd, DTL_OUTAGE);

		/* Restore the saved flag and recompute the DTLs to match. */
		vd->vdev_cant_read = cant_read;
		vdev_dtl_reassess(tvd, 0, 0, B_FALSE);

		if (!required && zio_injection_enabled)
			required = !!zio_handle_device_injection(vd, NULL, ECHILD);

		return (required);
	}

	/*
	 * Determine if resilver is needed, and if so the txg range.
	 *
	 * A leaf needs resilvering when its DTL_MISSING map is non-empty and it is
	 * writeable; an interior vdev needs it when any child does.  When B_TRUE is
	 * returned, the combined range is stored through 'minp' and 'maxp'.  Either
	 * pointer may be NULL, in which case that bound is not reported.
	 */
	boolean_t
	vdev_resilver_needed(vdev_t *vd, uint64_t *minp, uint64_t *maxp)
	{
		boolean_t needed = B_FALSE;
		uint64_t thismin = UINT64_MAX;
		uint64_t thismax = 0;

		if (vd->vdev_children == 0) {
			mutex_enter(&vd->vdev_dtl_lock);
			if (range_tree_space(vd->vdev_dtl[DTL_MISSING]) != 0 &&
			    vdev_writeable(vd)) {

				thismin = vdev_dtl_min(vd);
				thismax = vdev_dtl_max(vd);
				needed = B_TRUE;
			}
			mutex_exit(&vd->vdev_dtl_lock);
		} else {
			for (int c = 0; c < vd->vdev_children; c++) {
				vdev_t *cvd = vd->vdev_child[c];
				uint64_t cmin, cmax;

				if (vdev_resilver_needed(cvd, &cmin, &cmax)) {
					thismin = MIN(thismin, cmin);
					thismax = MAX(thismax, cmax);
					needed = B_TRUE;
				}
			}
		}

		/*
		 * Check each output pointer on its own so that a caller supplying
		 * only one of them never causes a NULL dereference.
		 */
		if (needed) {
			if (minp != NULL)
				*minp = thismin;
			if (maxp != NULL)
				*maxp = thismax;
		}
		return (needed);
	}

	/*
	 * Load the state of a vdev tree, children first: initialize the metaslabs
	 * of each top-level vdev and load the DTL of each leaf.  Any failure marks
	 * the affected vdev CANT_OPEN with VDEV_AUX_CORRUPT_DATA; nothing is
	 * returned to the caller.
	 */
	void
	vdev_load(vdev_t *vd)
	{
		/*
		 * Recursively load all children.
		 */
		for (int c = 0; c < vd->vdev_children; c++)
			vdev_load(vd->vdev_child[c]);

		/*
		 * If this is a top-level vdev, initialize its metaslabs.
		 */
		if (vd == vd->vdev_top && !vd->vdev_ishole &&
		    (vd->vdev_ashift == 0 || vd->vdev_asize == 0 ||
		    vdev_metaslab_init(vd, 0) != 0))
			vdev_set_state(vd, B_FALSE, VDEV_STATE_CANT_OPEN,
			    VDEV_AUX_CORRUPT_DATA);

		/*
		 * If this is a leaf vdev, load its DTL.
		 */
		if (vd->vdev_ops->vdev_op_leaf && vdev_dtl_load(vd) != 0)
			vdev_set_state(vd, B_FALSE, VDEV_STATE_CANT_OPEN,
			    VDEV_AUX_CORRUPT_DATA);
	}

	/*
	* The special vdev case is used for hot spares and l2cache devices. Its
	* sole purpose is to set the vdev state for the associated vdev. To do this,
	* we make sure that we can open the underlying device, then try to read the
	* label, and make sure that the label is sane and that it hasn't been
	* repurposed to another pool.
	*/
	/*
	 * Returns 0 if the vdev is not readable (nothing is checked) or if its
	 * label passes validation.  Returns -1, after marking the vdev CANT_OPEN
	 * with VDEV_AUX_CORRUPT_DATA, if the label cannot be read, lacks a
	 * supported version, lacks a pool state, or carries a guid different from
	 * the vdev's.
	 */
	int
	vdev_validate_aux(vdev_t *vd)
	{
		nvlist_t *label;
		uint64_t guid, version;
		uint64_t state;

		if (!vdev_readable(vd))
			return (0);

		if ((label = vdev_label_read_config(vd, -1ULL)) == NULL) {
			vdev_set_state(vd, B_TRUE, VDEV_STATE_CANT_OPEN,
			    VDEV_AUX_CORRUPT_DATA);
			return (-1);
		}

		if (nvlist_lookup_uint64(label, ZPOOL_CONFIG_VERSION, &version) != 0 ||
		    !SPA_VERSION_IS_SUPPORTED(version) ||
		    nvlist_lookup_uint64(label, ZPOOL_CONFIG_GUID, &guid) != 0 ||
		    guid != vd->vdev_guid ||
		    nvlist_lookup_uint64(label, ZPOOL_CONFIG_POOL_STATE, &state) != 0) {
			vdev_set_state(vd, B_TRUE, VDEV_STATE_CANT_OPEN,
			    VDEV_AUX_CORRUPT_DATA);
			nvlist_free(label);
			return (-1);
		}

		/*
		 * We don't actually check the pool state here.  If it's in fact in
		 * use by another pool, we update this fact on the fly when requested.
		 */
		nvlist_free(label);
		return (0);
	}

	/*
	 * Free the on-disk metadata of a vdev in 'txg': the space map of every
	 * metaslab that has one, and the metaslab array object.  Each metaslab is
	 * first removed from the group histogram, and its space map is verified
	 * to have nothing allocated before it is freed.
	 */
	void
	vdev_remove(vdev_t *vd, uint64_t txg)
	{
		spa_t *spa = vd->vdev_spa;
		objset_t *mos = spa->spa_meta_objset;
		dmu_tx_t *tx;

		tx = dmu_tx_create_assigned(spa_get_dsl(spa), txg);

		if (vd->vdev_ms != NULL) {
			metaslab_group_t *mg = vd->vdev_mg;

			metaslab_group_histogram_verify(mg);
			metaslab_class_histogram_verify(mg->mg_class);

			for (int m = 0; m < vd->vdev_ms_count; m++) {
				metaslab_t *msp = vd->vdev_ms[m];

				if (msp == NULL || msp->ms_sm == NULL)
					continue;

				mutex_enter(&msp->ms_lock);
				/*
				 * If the metaslab was not loaded when the vdev
				 * was removed then the histogram accounting may
				 * not be accurate.  Update the histogram information
				 * here so that we ensure that the metaslab group
				 * and metaslab class are up-to-date.
				 */
				metaslab_group_histogram_remove(mg, msp);

				VERIFY0(space_map_allocated(msp->ms_sm));
				space_map_free(msp->ms_sm, tx);
				space_map_close(msp->ms_sm);
				msp->ms_sm = NULL;
				mutex_exit(&msp->ms_lock);
			}

			metaslab_group_histogram_verify(mg);
			metaslab_class_histogram_verify(mg->mg_class);
			for (int i = 0; i < RANGE_TREE_HISTOGRAM_SIZE; i++)
				ASSERT0(mg->mg_histogram[i]);

		}

		if (vd->vdev_ms_array) {
			(void) dmu_object_free(mos, vd->vdev_ms_array, tx);
			vd->vdev_ms_array = 0;
		}
		dmu_tx_commit(tx);
	}

	/*
	 * Finish syncing 'txg' for a vdev: call metaslab_sync_done() on every
	 * metaslab queued on the TXG_CLEAN(txg) list and, if that list was
	 * non-empty on entry, reassess the metaslab group.
	 */
	void
	vdev_sync_done(vdev_t *vd, uint64_t txg)
	{
		metaslab_t *msp;
		boolean_t reassess = !txg_list_empty(&vd->vdev_ms_list, TXG_CLEAN(txg));

		ASSERT(!vd->vdev_ishole);

		/* Explicit NULL test, matching the drain loops in vdev_sync(). */
		while ((msp = txg_list_remove(&vd->vdev_ms_list,
		    TXG_CLEAN(txg))) != NULL)
			metaslab_sync_done(msp, txg);

		if (reassess)
			metaslab_sync_reassess(vd->vdev_mg);
	}

	/*
	 * Sync a vdev's dirty state for 'txg': allocate the metaslab array object
	 * if it is still missing, remove the vdev's metadata if it is empty and
	 * being removed, sync every dirty metaslab and leaf DTL, and finally
	 * queue the vdev on the spa's TXG_CLEAN(txg) list.
	 */
	void
	vdev_sync(vdev_t *vd, uint64_t txg)
	{
		spa_t *spa = vd->vdev_spa;
		metaslab_t *ms;
		vdev_t *leaf;

		ASSERT(!vd->vdev_ishole);

		if (vd->vdev_ms_array == 0 && vd->vdev_ms_shift != 0) {
			dmu_tx_t *tx;

			ASSERT(vd == vd->vdev_top);
			tx = dmu_tx_create_assigned(spa->spa_dsl_pool, txg);
			vd->vdev_ms_array = dmu_object_alloc(spa->spa_meta_objset,
			    DMU_OT_OBJECT_ARRAY, 0, DMU_OT_NONE, 0, tx);
			ASSERT(vd->vdev_ms_array != 0);
			vdev_config_dirty(vd);
			dmu_tx_commit(tx);
		}

		/*
		 * Remove the metadata associated with this vdev once it's empty.
		 */
		if (vd->vdev_stat.vs_alloc == 0 && vd->vdev_removing) {
			vdev_remove(vd, txg);
		}

		/* Sync each dirty metaslab, then requeue it for vdev_sync_done(). */
		for (;;) {
			ms = txg_list_remove(&vd->vdev_ms_list, txg);
			if (ms == NULL)
				break;
			metaslab_sync(ms, txg);
			(void) txg_list_add(&vd->vdev_ms_list, ms, TXG_CLEAN(txg));
		}

		/* Sync the DTL of each dirty leaf. */
		for (;;) {
			leaf = txg_list_remove(&vd->vdev_dtl_list, txg);
			if (leaf == NULL)
				break;
			vdev_dtl_sync(leaf, txg);
		}

		(void) txg_list_add(&spa->spa_vdev_txg_list, vd, TXG_CLEAN(txg));
	}

	/*
	 * Convert 'psize' to an asize by delegating to the vdev's vdev_op_asize
	 * callback.
	 */
	uint64_t
	vdev_psize_to_asize(vdev_t *vd, uint64_t psize)
	{
		uint64_t asize = vd->vdev_ops->vdev_op_asize(vd, psize);

		return (asize);
	}

	/*
	* Mark the given vdev faulted. A faulted vdev behaves as if the device could
	* not be opened, and no I/O is attempted.
	*/
	int
	vdev_fault(spa_t *spa, uint64_t guid, vdev_aux_t aux)
	{
		vdev_t *vd, *tvd;

		spa_vdev_state_enter(spa, SCL_NONE);

		/* Fails with ENODEV for an unknown guid, ENOTSUP for a non-leaf. */
		if ((vd = spa_lookup_by_guid(spa, guid, B_TRUE)) == NULL)
			return (spa_vdev_state_exit(spa, NULL, ENODEV));

		if (!vd->vdev_ops->vdev_op_leaf)
			return (spa_vdev_state_exit(spa, NULL, ENOTSUP));

		tvd = vd->vdev_top;

		/*
		 * We don't directly use the aux state here, but if we do a
		 * vdev_reopen(), we need this value to be present to remember why we
		 * were faulted.
		 */
		vd->vdev_label_aux = aux;

		/*
		 * Faulted state takes precedence over degraded.
		 */
		vd->vdev_delayed_close = B_FALSE;
		vd->vdev_faulted = 1ULL;
		vd->vdev_degraded = 0ULL;
		vdev_set_state(vd, B_FALSE, VDEV_STATE_FAULTED, aux);

		/*
		 * If this device has the only valid copy of the data, then
		 * back off and simply mark the vdev as degraded instead.
		 */
		if (!tvd->vdev_islog && vd->vdev_aux == NULL && vdev_dtl_required(vd)) {
			vd->vdev_degraded = 1ULL;
			vd->vdev_faulted = 0ULL;

			/*
			 * If we reopen the device and it's not dead, only then do we
			 * mark it degraded.
			 */
			vdev_reopen(tvd);

			if (vdev_readable(vd))
				vdev_set_state(vd, B_FALSE, VDEV_STATE_DEGRADED, aux);
		}

		return (spa_vdev_state_exit(spa, vd, 0));
	}

	/*
	 * Mark the given vdev degraded.  A degraded vdev is purely an indication to the
	 * user that something is wrong.  The vdev continues to operate as normal as far
	 * as I/O is concerned.
	 *
	 * ENODEV is reported when no vdev matches 'guid' and ENOTSUP when the vdev is
	 * not a leaf.  The value returned is the result of spa_vdev_state_exit().
	 */
	int
	vdev_degrade(spa_t *spa, uint64_t guid, vdev_aux_t aux)
	{
		vdev_t *vd;

		spa_vdev_state_enter(spa, SCL_NONE);

		if ((vd = spa_lookup_by_guid(spa, guid, B_TRUE)) == NULL)
			return (spa_vdev_state_exit(spa, NULL, ENODEV));

		if (!vd->vdev_ops->vdev_op_leaf)
			return (spa_vdev_state_exit(spa, NULL, ENOTSUP));

		/*
		 * If the vdev is already faulted or degraded, then don't do
		 * anything.
		 */
		if (vd->vdev_faulted || vd->vdev_degraded)
			return (spa_vdev_state_exit(spa, NULL, 0));

		vd->vdev_degraded = 1ULL;
		/* The persistent flag is always set; the state only if not dead. */
		if (!vdev_is_dead(vd))
			vdev_set_state(vd, B_FALSE, VDEV_STATE_DEGRADED,
			    aux);

		return (spa_vdev_state_exit(spa, vd, 0));
	}

	/*
	 * Online the given vdev.
	 *
	 * If 'ZFS_ONLINE_UNSPARE' is set, it implies two things.  First, any attached
	 * spare device should be detached when the device finishes resilvering.
	 * Second, the online should be treated like a 'test' online case, so no FMA
	 * events are generated if the device fails to open.
	 *
	 * If 'newstate' is non-NULL it receives the vdev's state after the reopen.
	 * ENODEV is reported when no vdev matches 'guid', ENOTSUP when the vdev is
	 * not a leaf or when expansion is requested (or spa_autoexpand is set) for
	 * an aux vdev.  The value returned is the result of spa_vdev_state_exit().
	 */
	int
	vdev_online(spa_t *spa, uint64_t guid, uint64_t flags, vdev_state_t *newstate)
	{
		vdev_t *vd, *tvd, *pvd, *rvd = spa->spa_root_vdev;

		spa_vdev_state_enter(spa, SCL_NONE);

		if ((vd = spa_lookup_by_guid(spa, guid, B_TRUE)) == NULL)
			return (spa_vdev_state_exit(spa, NULL, ENODEV));

		if (!vd->vdev_ops->vdev_op_leaf)
			return (spa_vdev_state_exit(spa, NULL, ENOTSUP));

		tvd = vd->vdev_top;
		vd->vdev_offline = B_FALSE;
		vd->vdev_tmpoffline = B_FALSE;
		vd->vdev_checkremove = !!(flags & ZFS_ONLINE_CHECKREMOVE);
		vd->vdev_forcefault = !!(flags & ZFS_ONLINE_FORCEFAULT);

		/* XXX - L2ARC 1.0 does not support expansion */
		if (!vd->vdev_aux) {
			for (pvd = vd; pvd != rvd; pvd = pvd->vdev_parent)
				pvd->vdev_expanding = !!(flags & ZFS_ONLINE_EXPAND);
		}

		vdev_reopen(tvd);
		/* These flags only apply to the reopen above; clear them again. */
		vd->vdev_checkremove = vd->vdev_forcefault = B_FALSE;

		if (!vd->vdev_aux) {
			for (pvd = vd; pvd != rvd; pvd = pvd->vdev_parent)
				pvd->vdev_expanding = B_FALSE;
		}

		if (newstate)
			*newstate = vd->vdev_state;
		if ((flags & ZFS_ONLINE_UNSPARE) &&
		    !vdev_is_dead(vd) && vd->vdev_parent &&
		    vd->vdev_parent->vdev_ops == &vdev_spare_ops &&
		    vd->vdev_parent->vdev_child[0] == vd)
			vd->vdev_unspare = B_TRUE;

		if ((flags & ZFS_ONLINE_EXPAND) || spa->spa_autoexpand) {

			/* XXX - L2ARC 1.0 does not support expansion */
			if (vd->vdev_aux)
				return (spa_vdev_state_exit(spa, vd, ENOTSUP));
			spa_async_request(spa, SPA_ASYNC_CONFIG_UPDATE);
		}
		return (spa_vdev_state_exit(spa, vd, 0));
	}

	/*
	 * Worker for vdev_offline(), which calls this with spa_vdev_top_lock held.
	 *
	 * ENODEV is reported when no vdev matches 'guid', ENOTSUP when the vdev is
	 * not a leaf, and EBUSY when the device cannot be offlined because
	 * vdev_dtl_required() says it is needed or because offlining it leaves the
	 * top-level vdev dead.  An error from spa_offline_log() is passed through.
	 * The value returned is the result of spa_vdev_state_exit().
	 */
	static int
	vdev_offline_locked(spa_t *spa, uint64_t guid, uint64_t flags)
	{
		vdev_t *vd, *tvd;
		int error = 0;
		uint64_t generation;
		metaslab_group_t *mg;

	top:
		spa_vdev_state_enter(spa, SCL_ALLOC);

		if ((vd = spa_lookup_by_guid(spa, guid, B_TRUE)) == NULL)
			return (spa_vdev_state_exit(spa, NULL, ENODEV));

		if (!vd->vdev_ops->vdev_op_leaf)
			return (spa_vdev_state_exit(spa, NULL, ENOTSUP));

		tvd = vd->vdev_top;
		mg = tvd->vdev_mg;
		/* Value compared against spa_config_generation after re-entry below. */
		generation = spa->spa_config_generation + 1;

		/*
		 * If the device isn't already offline, try to offline it.
		 */
		if (!vd->vdev_offline) {
			/*
			 * If this device has the only valid copy of some data,
			 * don't allow it to be offlined. Log devices are always
			 * expendable.
			 */
			if (!tvd->vdev_islog && vd->vdev_aux == NULL &&
			    vdev_dtl_required(vd))
				return (spa_vdev_state_exit(spa, NULL, EBUSY));

			/*
			 * If the top-level is a slog and it has had allocations
			 * then proceed.  We check that the vdev's metaslab group
			 * is not NULL since it's possible that we may have just
			 * added this vdev but not yet initialized its metaslabs.
			 */
			if (tvd->vdev_islog && mg != NULL) {
				/*
				 * Prevent any future allocations.
				 */
				metaslab_group_passivate(mg);
				(void) spa_vdev_state_exit(spa, vd, 0);

				error = spa_offline_log(spa);

				spa_vdev_state_enter(spa, SCL_ALLOC);

				/*
				 * Check to see if the config has changed.
				 */
				if (error || generation != spa->spa_config_generation) {
					metaslab_group_activate(mg);
					if (error)
						return (spa_vdev_state_exit(spa,
						    vd, error));
					(void) spa_vdev_state_exit(spa, vd, 0);
					goto top;
				}
				ASSERT0(tvd->vdev_stat.vs_alloc);
			}

			/*
			 * Offline this device and reopen its top-level vdev.
			 * If the top-level vdev is a log device then just offline
			 * it. Otherwise, if this action results in the top-level
			 * vdev becoming unusable, undo it and fail the request.
			 */
			vd->vdev_offline = B_TRUE;
			vdev_reopen(tvd);

			if (!tvd->vdev_islog && vd->vdev_aux == NULL &&
			    vdev_is_dead(tvd)) {
				vd->vdev_offline = B_FALSE;
				vdev_reopen(tvd);
				return (spa_vdev_state_exit(spa, NULL, EBUSY));
			}

			/*
			 * Add the device back into the metaslab rotor so that
			 * once we online the device it's open for business.
			 */
			if (tvd->vdev_islog && mg != NULL)
				metaslab_group_activate(mg);
		}

		vd->vdev_tmpoffline = !!(flags & ZFS_OFFLINE_TEMPORARY);

		return (spa_vdev_state_exit(spa, vd, 0));
	}

	/*
	 * Offline the vdev identified by 'guid'.  The work is done by
	 * vdev_offline_locked(); this wrapper only holds spa_vdev_top_lock around
	 * that call and hands back its result.
	 */
	int
	vdev_offline(spa_t *spa, uint64_t guid, uint64_t flags)
	{
		int rc;

		mutex_enter(&spa->spa_vdev_top_lock);
		rc = vdev_offline_locked(spa, guid, flags);
		mutex_exit(&spa->spa_vdev_top_lock);

		return (rc);
	}

	/*
	 * Clear the error counts associated with this vdev.  Unlike vdev_online() and
	 * vdev_offline(), we assume the spa config is locked.  We also clear all
	 * children.  If 'vd' is NULL, then the user wants to clear all vdevs.
	 */
	void
	vdev_clear(spa_t *spa, vdev_t *vd)
	{
		vdev_t *rvd = spa->spa_root_vdev;

		ASSERT(spa_config_held(spa, SCL_STATE_ALL, RW_WRITER) == SCL_STATE_ALL);

		if (vd == NULL)
			vd = rvd;

		vd->vdev_stat.vs_read_errors = 0;
		vd->vdev_stat.vs_write_errors = 0;
		vd->vdev_stat.vs_checksum_errors = 0;

		for (int c = 0; c < vd->vdev_children; c++)
			vdev_clear(spa, vd->vdev_child[c]);

		/*
		 * If we're in the FAULTED state or have experienced failed I/O, then
		 * clear the persistent state and attempt to reopen the device.  We
		 * also mark the vdev config dirty, so that the new faulted state is
		 * written out to disk.
		 */
		if (vd->vdev_faulted || vd->vdev_degraded ||
		    !vdev_readable(vd) || !vdev_writeable(vd)) {

			/*
			 * When reopening in response to a clear event, it may be due to
			 * a fmadm repair request.  In this case, if the device is
			 * still broken, we want to still post the ereport again.
			 */
			vd->vdev_forcefault = B_TRUE;

			vd->vdev_faulted = vd->vdev_degraded = 0ULL;
			vd->vdev_cant_read = B_FALSE;
			vd->vdev_cant_write = B_FALSE;

			vdev_reopen(vd == rvd ? rvd : vd->vdev_top);

			vd->vdev_forcefault = B_FALSE;

			if (vd != rvd && vdev_writeable(vd->vdev_top))
				vdev_state_dirty(vd->vdev_top);

			if (vd->vdev_aux == NULL && !vdev_is_dead(vd))
				spa_async_request(spa, SPA_ASYNC_RESILVER);

			spa_event_notify(spa, vd, ESC_ZFS_VDEV_CLEAR);
		}

		/*
		 * When clearing a FMA-diagnosed fault, we always want to
		 * unspare the device, as we assume that the original spare was
		 * done in response to the FMA fault.
		 */
		if (!vdev_is_dead(vd) && vd->vdev_parent != NULL &&
		    vd->vdev_parent->vdev_ops == &vdev_spare_ops &&
		    vd->vdev_parent->vdev_child[0] == vd)
			vd->vdev_unspare = B_TRUE;
	}

	/*
	 * Report whether a vdev counts as "dead": its state is below
	 * VDEV_STATE_DEGRADED, it is a hole, or it uses vdev_missing_ops.
	 */
	boolean_t
	vdev_is_dead(vdev_t *vd)
	{
		/*
		 * Holes and missing devices are always considered "dead".
		 * This simplifies the code since we don't have to check for
		 * these types of devices in the various code paths.
		 * Instead we rely on the fact that we skip over dead devices
		 * before issuing I/O to them.
		 */
		return (vd->vdev_state < VDEV_STATE_DEGRADED || vd->vdev_ishole ||
		    vd->vdev_ops == &vdev_missing_ops);
	}

	/*
	 * A vdev is readable when it is not dead and vdev_cant_read is clear.
	 */
	boolean_t
	vdev_readable(vdev_t *vd)
	{
		if (vdev_is_dead(vd))
			return (B_FALSE);

		return (!vd->vdev_cant_read);
	}

	/*
	 * A vdev is writeable when it is not dead and vdev_cant_write is clear.
	 */
	boolean_t
	vdev_writeable(vdev_t *vd)
	{
		if (vdev_is_dead(vd))
			return (B_FALSE);

		return (!vd->vdev_cant_write);
	}

	/*
	 * Report whether allocations may be directed at this vdev.
	 */
	boolean_t
	vdev_allocatable(vdev_t *vd)
	{
		/*
		 * Snapshot the state once: it changes atomically, but two separate
		 * questions are asked about it below and both must see one value.
		 */
		uint64_t snapshot = vd->vdev_state;

		/*
		 * Vdevs that may be in the middle of reopening (i.e.
		 * VDEV_STATE_CLOSED) are currently allowed.  Should the reopen
		 * fail, it is caught later once the proper locks are held.  Any
		 * other state below DEGRADED rules the vdev out.
		 */
		if (snapshot < VDEV_STATE_DEGRADED && snapshot != VDEV_STATE_CLOSED)
			return (B_FALSE);

		return (!vd->vdev_cant_write && !vd->vdev_ishole);
	}

	/*
	 * Decide whether 'zio' may be issued to 'vd' (which must be zio->io_vd).
	 * A vdev that is dead or has vdev_remove_wanted set is never accessible.
	 * Otherwise reads require vdev_cant_read to be clear, writes require
	 * vdev_cant_write to be clear, and every other I/O type is allowed.
	 */
	boolean_t
	vdev_accessible(vdev_t *vd, zio_t *zio)
	{
		ASSERT(zio->io_vd == vd);

		if (vdev_is_dead(vd) || vd->vdev_remove_wanted)
			return (B_FALSE);

		if (zio->io_type == ZIO_TYPE_READ)
			return (!vd->vdev_cant_read);

		if (zio->io_type == ZIO_TYPE_WRITE)
			return (!vd->vdev_cant_write);

		return (B_TRUE);
	}

	/*
	 * Get statistics for the given vdev.
	 *
	 * Copies vd->vdev_stat into the caller-supplied 'vs' under vdev_stat_lock
	 * and then fills in the derived fields (elapsed time, state, rsize, esize
	 * and, for non-aux, non-hole top-level vdevs, fragmentation).
	 */
	void
	vdev_get_stats(vdev_t *vd, vdev_stat_t *vs)
	{
		spa_t *spa = vd->vdev_spa;
		vdev_t *rvd = spa->spa_root_vdev;

		ASSERT(spa_config_held(spa, SCL_ALL, RW_READER) != 0);

		mutex_enter(&vd->vdev_stat_lock);
		bcopy(&vd->vdev_stat, vs, sizeof (*vs));
		/* The copy holds a start time; report time elapsed since then. */
		vs->vs_timestamp = gethrtime() - vs->vs_timestamp;
		vs->vs_state = vd->vdev_state;
		vs->vs_rsize = vdev_get_min_asize(vd);
		if (vd->vdev_ops->vdev_op_leaf)
			vs->vs_rsize += VDEV_LABEL_START_SIZE + VDEV_LABEL_END_SIZE;
		vs->vs_esize = vd->vdev_max_asize - vd->vdev_asize;
		if (vd->vdev_aux == NULL && vd == vd->vdev_top && !vd->vdev_ishole) {
			vs->vs_fragmentation = vd->vdev_mg->mg_fragmentation;
		}

		/*
		 * If we're getting stats on the root vdev, aggregate the I/O counts
		 * over all top-level vdevs (i.e. the direct children of the root).
		 */
		if (vd == rvd) {
			for (int c = 0; c < rvd->vdev_children; c++) {
				vdev_t *cvd = rvd->vdev_child[c];
				vdev_stat_t *cvs = &cvd->vdev_stat;

				for (int t = 0; t < ZIO_TYPES; t++) {
					vs->vs_ops[t] += cvs->vs_ops[t];
					vs->vs_bytes[t] += cvs->vs_bytes[t];
				}
				/* Note: this updates the child's stats, not 'vs'. */
				cvs->vs_scan_removing = cvd->vdev_removing;
			}
		}
		mutex_exit(&vd->vdev_stat_lock);
	}

	/*
	 * Zero the space accounting fields (vs_space, vs_dspace, vs_alloc) of a
	 * vdev's stats while holding its vdev_stat_lock.
	 */
	void
	vdev_clear_stats(vdev_t *vd)
	{
		mutex_enter(&vd->vdev_stat_lock);
		vd->vdev_stat.vs_alloc = 0;
		vd->vdev_stat.vs_dspace = 0;
		vd->vdev_stat.vs_space = 0;
		mutex_exit(&vd->vdev_stat_lock);
	}

	/*
	 * Reset vs_scan_processed to zero for this vdev and, recursively, for
	 * every vdev beneath it.
	 */
	void
	vdev_scan_stat_init(vdev_t *vd)
	{
		int c = 0;

		/* Descendants are reset first, each under its own stat lock. */
		while (c < vd->vdev_children) {
			vdev_scan_stat_init(vd->vdev_child[c]);
			c++;
		}

		mutex_enter(&vd->vdev_stat_lock);
		vd->vdev_stat.vs_scan_processed = 0;
		mutex_exit(&vd->vdev_stat_lock);
	}

	/*
	 * Account a completed zio against its vdev's statistics.
	 *
	 * Successful I/O (io_error == 0) bumps the per-type op and byte counters,
	 * plus the scan/self-heal counters for repair I/O.  Failed I/O bumps the
	 * read, write or checksum error counters and, for writes carrying a txg,
	 * dirties the relevant DTLs.  'psize' is the byte count added to the
	 * counters.
	 */
	void
	vdev_stat_update(zio_t *zio, uint64_t psize)
	{
		spa_t *spa = zio->io_spa;
		vdev_t *rvd = spa->spa_root_vdev;
		vdev_t *vd = zio->io_vd ? zio->io_vd : rvd;
		vdev_t *pvd;
		uint64_t txg = zio->io_txg;
		vdev_stat_t *vs = &vd->vdev_stat;
		zio_type_t type = zio->io_type;
		int flags = zio->io_flags;

		/*
		 * If this i/o is a gang leader, it didn't do any actual work.
		 */
		if (zio->io_gang_tree)
			return;

		if (zio->io_error == 0) {
			/*
			 * If this is a root i/o, don't count it -- we've already
			 * counted the top-level vdevs, and vdev_get_stats() will
			 * aggregate them when asked.  This reduces contention on
			 * the root vdev_stat_lock and implicitly handles blocks
			 * that compress away to holes, for which there is no i/o.
			 * (Holes never create vdev children, so all the counters
			 * remain zero, which is what we want.)
			 *
			 * Note: this only applies to successful i/o (io_error == 0)
			 * because unlike i/o counts, errors are not additive.
			 * When reading a ditto block, for example, failure of
			 * one top-level vdev does not imply a root-level error.
			 */
			if (vd == rvd)
				return;

			ASSERT(vd == zio->io_vd);

			if (flags & ZIO_FLAG_IO_BYPASS)
				return;

			mutex_enter(&vd->vdev_stat_lock);

			if (flags & ZIO_FLAG_IO_REPAIR) {
				if (flags & ZIO_FLAG_SCAN_THREAD) {
					dsl_scan_phys_t *scn_phys =
					    &spa->spa_dsl_pool->dp_scan->scn_phys;
					uint64_t *processed = &scn_phys->scn_processed;

					/* XXX cleanup? */
					if (vd->vdev_ops->vdev_op_leaf)
						atomic_add_64(processed, psize);
					vs->vs_scan_processed += psize;
				}

				if (flags & ZIO_FLAG_SELF_HEAL)
					vs->vs_self_healed += psize;
			}

			vs->vs_ops[type]++;
			vs->vs_bytes[type] += psize;

			mutex_exit(&vd->vdev_stat_lock);
			return;
		}

		if (flags & ZIO_FLAG_SPECULATIVE)
			return;

		/*
		 * If this is an I/O error that is going to be retried, then ignore the
		 * error.  Otherwise, the user may interpret B_FAILFAST I/O errors as
		 * hard errors, when in reality they can happen for any number of
		 * innocuous reasons (bus resets, MPxIO link failure, etc).
		 */
		if (zio->io_error == EIO &&
		    !(zio->io_flags & ZIO_FLAG_IO_RETRY))
			return;

		/*
		 * Intent logs writes won't propagate their error to the root
		 * I/O so don't mark these types of failures as pool-level
		 * errors.
		 */
		if (zio->io_vd == NULL && (zio->io_flags & ZIO_FLAG_DONT_PROPAGATE))
			return;

		mutex_enter(&vd->vdev_stat_lock);
		if (type == ZIO_TYPE_READ && !vdev_is_dead(vd)) {
			if (zio->io_error == ECKSUM)
				vs->vs_checksum_errors++;
			else
				vs->vs_read_errors++;
		}
		if (type == ZIO_TYPE_WRITE && !vdev_is_dead(vd))
			vs->vs_write_errors++;
		mutex_exit(&vd->vdev_stat_lock);

		if (type == ZIO_TYPE_WRITE && txg != 0 &&
		    (!(flags & ZIO_FLAG_IO_REPAIR) ||
		    (flags & ZIO_FLAG_SCAN_THREAD) ||
		    spa->spa_claiming)) {
			/*
			 * This is either a normal write (not a repair), or it's
			 * a repair induced by the scrub thread, or it's a repair
			 * made by zil_claim() during spa_load() in the first txg.
			 * In the normal case, we commit the DTL change in the same
			 * txg as the block was born.  In the scrub-induced repair
			 * case, we know that scrubs run in first-pass syncing context,
			 * so we commit the DTL change in spa_syncing_txg(spa).
			 * In the zil_claim() case, we commit in spa_first_txg(spa).
			 *
			 * We currently do not make DTL entries for failed spontaneous
			 * self-healing writes triggered by normal (non-scrubbing)
			 * reads, because we have no transactional context in which to
			 * do so -- and it's not clear that it'd be desirable anyway.
			 */
			if (vd->vdev_ops->vdev_op_leaf) {
				uint64_t commit_txg = txg;
				if (flags & ZIO_FLAG_SCAN_THREAD) {
					ASSERT(flags & ZIO_FLAG_IO_REPAIR);
					ASSERT(spa_sync_pass(spa) == 1);
					vdev_dtl_dirty(vd, DTL_SCRUB, txg, 1);
					commit_txg = spa_syncing_txg(spa);
				} else if (spa->spa_claiming) {
					ASSERT(flags & ZIO_FLAG_IO_REPAIR);
					commit_txg = spa_first_txg(spa);
				}
				ASSERT(commit_txg >= spa_syncing_txg(spa));
				/* Already recorded as missing; nothing more to do. */
				if (vdev_dtl_contains(vd, DTL_MISSING, txg, 1))
					return;
				for (pvd = vd; pvd != rvd; pvd = pvd->vdev_parent)
					vdev_dtl_dirty(pvd, DTL_PARTIAL, txg, 1);
				vdev_dirty(vd->vdev_top, VDD_DTL, vd, commit_txg);
			}
			if (vd != rvd)
				vdev_dtl_dirty(vd, DTL_MISSING, txg, 1);
		}
	}

	/*
	 * Update the in-core space usage stats for this vdev, its metaslab class,
	 * and the root vdev.
	 *
	 * 'vd' must be a top-level vdev.  The root vdev's stats are only adjusted
	 * when the vdev's metaslab class is the pool's normal class.
	 */
	void
	vdev_space_update(vdev_t *vd, int64_t alloc_delta, int64_t defer_delta,
	    int64_t space_delta)
	{
		int64_t dspace_delta = space_delta;
		spa_t *spa = vd->vdev_spa;
		vdev_t *rvd = spa->spa_root_vdev;
		metaslab_group_t *mg = vd->vdev_mg;
		metaslab_class_t *mc = mg ? mg->mg_class : NULL;

		ASSERT(vd == vd->vdev_top);

		/*
		 * Apply the inverse of the psize-to-asize (ie. RAID-Z) space-expansion
		 * factor.  We must calculate this here and not at the root vdev
		 * because the root vdev's psize-to-asize is simply the max of its
		 * childrens', thus not accurate enough for us.
		 */
		ASSERT((dspace_delta & (SPA_MINBLOCKSIZE-1)) == 0);
		ASSERT(vd->vdev_deflate_ratio != 0 || vd->vdev_isl2cache);
		dspace_delta = (dspace_delta >> SPA_MINBLOCKSHIFT) *
		    vd->vdev_deflate_ratio;

		mutex_enter(&vd->vdev_stat_lock);
		vd->vdev_stat.vs_alloc += alloc_delta;
		vd->vdev_stat.vs_space += space_delta;
		vd->vdev_stat.vs_dspace += dspace_delta;
		mutex_exit(&vd->vdev_stat_lock);

		if (mc == spa_normal_class(spa)) {
			mutex_enter(&rvd->vdev_stat_lock);
			rvd->vdev_stat.vs_alloc += alloc_delta;
			rvd->vdev_stat.vs_space += space_delta;
			rvd->vdev_stat.vs_dspace += dspace_delta;
			mutex_exit(&rvd->vdev_stat_lock);
		}

		if (mc != NULL) {
			ASSERT(rvd == vd->vdev_parent);
			ASSERT(vd->vdev_ms_count != 0);

			metaslab_class_space_update(mc,
			    alloc_delta, defer_delta, space_delta, dspace_delta);
		}
	}

	/*
	 * Mark a top-level vdev's config as dirty, placing it on the dirty list
	 * so that it will be written out next time the vdev configuration is synced.
	 * If the root vdev is specified (vdev_top == NULL), dirty all top-level vdevs.
	 */
	void
	vdev_config_dirty(vdev_t *vd)
	{
		spa_t *spa = vd->vdev_spa;
		vdev_t *rvd = spa->spa_root_vdev;
		int c;

		ASSERT(spa_writeable(spa));

		/*
		 * If this is an aux vdev (as with l2cache and spare devices), then we
		 * update the vdev config manually and set the sync flag.
		 */
		if (vd->vdev_aux != NULL) {
			spa_aux_vdev_t *sav = vd->vdev_aux;
			nvlist_t **aux;
			uint_t naux;

			/* Find this vdev's slot in the aux vdev array. */
			for (c = 0; c < sav->sav_count; c++) {
				if (sav->sav_vdevs[c] == vd)
					break;
			}

			if (c == sav->sav_count) {
				/*
				 * We're being removed.  There's nothing more to do.
				 */
				ASSERT(sav->sav_sync == B_TRUE);
				return;
			}

			sav->sav_sync = B_TRUE;

			/* The array is stored under either the l2cache or spares key. */
			if (nvlist_lookup_nvlist_array(sav->sav_config,
			    ZPOOL_CONFIG_L2CACHE, &aux, &naux) != 0) {
				VERIFY(nvlist_lookup_nvlist_array(sav->sav_config,
				    ZPOOL_CONFIG_SPARES, &aux, &naux) == 0);
			}

			ASSERT(c < naux);

			/*
			 * Setting the nvlist in the middle of the array is a little
			 * sketchy, but it will work.
			 */
			nvlist_free(aux[c]);
			aux[c] = vdev_config_generate(spa, vd, B_TRUE, 0);

			return;
		}

		/*
		 * The dirty list is protected by the SCL_CONFIG lock.  The caller
		 * must either hold SCL_CONFIG as writer, or must be the sync thread
		 * (which holds SCL_CONFIG as reader).  There's only one sync thread,
		 * so this is sufficient to ensure mutual exclusion.
		 */
		ASSERT(spa_config_held(spa, SCL_CONFIG, RW_WRITER) ||
		    (dsl_pool_sync_context(spa_get_dsl(spa)) &&
		    spa_config_held(spa, SCL_CONFIG, RW_READER)));

		if (vd == rvd) {
			for (c = 0; c < rvd->vdev_children; c++)
				vdev_config_dirty(rvd->vdev_child[c]);
		} else {
			ASSERT(vd == vd->vdev_top);

			/* Holes, and vdevs already on the list, are not added. */
			if (!list_link_active(&vd->vdev_config_dirty_node) &&
			    !vd->vdev_ishole)
				list_insert_head(&spa->spa_config_dirty_list, vd);
		}
	}

	/*
	 * Take a vdev off the spa's config dirty list.  The vdev must currently be
	 * on that list, and the caller must hold SCL_CONFIG as writer or be the
	 * sync thread holding it as reader (both are asserted).
	 */
	void
	vdev_config_clean(vdev_t *vd)
	{
		spa_t *spa = vd->vdev_spa;

		ASSERT(spa_config_held(spa, SCL_CONFIG, RW_WRITER) ||
		    (dsl_pool_sync_context(spa_get_dsl(spa)) &&
		    spa_config_held(spa, SCL_CONFIG, RW_READER)));

		ASSERT(list_link_active(&vd->vdev_config_dirty_node));
		list_remove(&spa->spa_config_dirty_list, vd);
	}

	/*
	 * Mark a top-level vdev's state as dirty, so that the next pass of
	 * spa_sync() can convert this into vdev_config_dirty().  We distinguish
	 * the state changes from larger config changes because they require
	 * much less locking, and are often needed for administrative actions.
	 */
	void
	vdev_state_dirty(vdev_t *vd)
	{
		spa_t *spa = vd->vdev_spa;

		ASSERT(spa_writeable(spa));
		ASSERT(vd == vd->vdev_top);

		/*
		 * The state list is protected by the SCL_STATE lock.  The caller
		 * must either hold SCL_STATE as writer, or must be the sync thread
		 * (which holds SCL_STATE as reader).  There's only one sync thread,
		 * so this is sufficient to ensure mutual exclusion.
		 */
		ASSERT(spa_config_held(spa, SCL_STATE, RW_WRITER) ||
		    (dsl_pool_sync_context(spa_get_dsl(spa)) &&
		    spa_config_held(spa, SCL_STATE, RW_READER)));

		/* Holes, and vdevs already on the list, are not added. */
		if (!list_link_active(&vd->vdev_state_dirty_node) && !vd->vdev_ishole)
			list_insert_head(&spa->spa_state_dirty_list, vd);
	}

	/*
	 * Take a vdev off the spa's state dirty list.  The vdev must currently be
	 * on that list, and the caller must hold SCL_STATE as writer or be the
	 * sync thread holding it as reader (both are asserted).
	 */
	void
	vdev_state_clean(vdev_t *vd)
	{
		spa_t *spa = vd->vdev_spa;

		ASSERT(spa_config_held(spa, SCL_STATE, RW_WRITER) ||
		    (dsl_pool_sync_context(spa_get_dsl(spa)) &&
		    spa_config_held(spa, SCL_STATE, RW_READER)));

		ASSERT(list_link_active(&vd->vdev_state_dirty_node));
		list_remove(&spa->spa_state_dirty_list, vd);
	}

	/*
	 * Propagate vdev state up from children to parent.
	 *
	 * For a vdev with children, counts how many are faulted or degraded and
	 * hands the totals to the vdev type's vdev_op_state_change() callback.
	 * Then recurses into the parent, if there is one.
	 */
	void
	vdev_propagate_state(vdev_t *vd)
	{
		spa_t *spa = vd->vdev_spa;
		vdev_t *rvd = spa->spa_root_vdev;
		int degraded = 0, faulted = 0;
		int corrupted = 0;
		vdev_t *child;

		if (vd->vdev_children > 0) {
			for (int c = 0; c < vd->vdev_children; c++) {
				child = vd->vdev_child[c];

				/*
				 * Don't factor holes into the decision.
				 */
				if (child->vdev_ishole)
					continue;

				if (!vdev_readable(child) ||
				    (!vdev_writeable(child) && spa_writeable(spa))) {
					/*
					 * Root special: if there is a top-level log
					 * device, treat the root vdev as if it were
					 * degraded.
					 */
					if (child->vdev_islog && vd == rvd)
						degraded++;
					else
						faulted++;
				} else if (child->vdev_state <= VDEV_STATE_DEGRADED) {
					degraded++;
				}

				if (child->vdev_stat.vs_aux == VDEV_AUX_CORRUPT_DATA)
					corrupted++;
			}

			vd->vdev_ops->vdev_op_state_change(vd, faulted, degraded);

			/*
			 * Root special: if there is a top-level vdev that cannot be
			 * opened due to corrupted metadata, then propagate the root
			 * vdev's aux state as 'corrupt' rather than 'insufficient
			 * replicas'.
			 */
			if (corrupted && vd == rvd &&
			    rvd->vdev_state == VDEV_STATE_CANT_OPEN)
				vdev_set_state(rvd, B_FALSE, VDEV_STATE_CANT_OPEN,
				    VDEV_AUX_CORRUPT_DATA);
		}

		if (vd->vdev_parent)
			vdev_propagate_state(vd->vdev_parent);
	}

	/*
	 * Set a vdev's state.  If this is during an open, we don't update the parent
	 * state, because we're in the process of opening children depth-first.
	 * Otherwise, we propagate the change to the parent.
	 *
	 * If this routine places a device in a faulted state, an appropriate ereport is
	 * generated.
	 *
	 * When 'state' equals the current state only the aux value is updated.
	 */
	void
	vdev_set_state(vdev_t *vd, boolean_t isopen, vdev_state_t state, vdev_aux_t aux)
	{
		uint64_t save_state;
		spa_t *spa = vd->vdev_spa;

		if (state == vd->vdev_state) {
			vd->vdev_stat.vs_aux = aux;
			return;
		}

		/* Remember the old state; it is passed along with the ereport. */
		save_state = vd->vdev_state;

		vd->vdev_state = state;
		vd->vdev_stat.vs_aux = aux;

		/*
		 * If we are setting the vdev state to anything but an open state, then
		 * always close the underlying device unless the device has requested
		 * a delayed close (i.e. we're about to remove or fault the device).
		 * Otherwise, we keep accessible but invalid devices open forever.
		 * We don't call vdev_close() itself, because that implies some extra
		 * checks (offline, etc) that we don't want here.  This is limited to
		 * leaf devices, because otherwise closing the device will affect other
		 * children.
		 */
		if (!vd->vdev_delayed_close && vdev_is_dead(vd) &&
		    vd->vdev_ops->vdev_op_leaf)
			vd->vdev_ops->vdev_op_close(vd);

		/*
		 * If we have brought this vdev back into service, we need
		 * to notify fmd so that it can gracefully repair any outstanding
		 * cases due to a missing device.  We do this in all cases, even those
		 * that probably don't correlate to a repaired fault.  This is sure to
		 * catch all cases, and we let the zfs-retire agent sort it out.  If
		 * this is a transient state it's OK, as the retire agent will
		 * double-check the state of the vdev before repairing it.
		 */
		if (state == VDEV_STATE_HEALTHY && vd->vdev_ops->vdev_op_leaf &&
		    vd->vdev_prevstate != state)
			zfs_post_state_change(spa, vd);

		if (vd->vdev_removed &&
		    state == VDEV_STATE_CANT_OPEN &&
		    (aux == VDEV_AUX_OPEN_FAILED || vd->vdev_checkremove)) {
			/*
			 * If the previous state is set to VDEV_STATE_REMOVED, then this
			 * device was previously marked removed and someone attempted to
			 * reopen it.  If this failed due to a nonexistent device, then
			 * keep the device in the REMOVED state.  We also let this be if
			 * it is one of our special test online cases, which is only
			 * attempting to online the device and shouldn't generate an FMA
			 * fault.
			 */
			vd->vdev_state = VDEV_STATE_REMOVED;
			vd->vdev_stat.vs_aux = VDEV_AUX_NONE;
		} else if (state == VDEV_STATE_REMOVED) {
			vd->vdev_removed = B_TRUE;
		} else if (state == VDEV_STATE_CANT_OPEN) {
			/*
			 * If we fail to open a vdev during an import or recovery, we
			 * mark it as "not available", which signifies that it was
			 * never there to begin with.  Failure to open such a device
			 * is not considered an error.
			 */
			if ((spa_load_state(spa) == SPA_LOAD_IMPORT ||
			    spa_load_state(spa) == SPA_LOAD_RECOVER) &&
			    vd->vdev_ops->vdev_op_leaf)
				vd->vdev_not_present = 1;

			/*
			 * Post the appropriate ereport.  If the 'prevstate' field is
			 * set to something other than VDEV_STATE_UNKNOWN, it indicates
			 * that this is part of a vdev_reopen().  In this case, we don't
			 * want to post the ereport if the device was already in the
			 * CANT_OPEN state beforehand.
			 *
			 * If the 'checkremove' flag is set, then this is an attempt to
			 * online the device in response to an insertion event.  If we
			 * hit this case, then we have detected an insertion event for a
			 * faulted or offline device that wasn't in the removed state.
			 * In this scenario, we don't post an ereport because we are
			 * about to replace the device, or attempt an online with
			 * vdev_forcefault, which will generate the fault for us.
			 */
			if ((vd->vdev_prevstate != state || vd->vdev_forcefault) &&
			    !vd->vdev_not_present && !vd->vdev_checkremove &&
			    vd != spa->spa_root_vdev) {
				const char *class;

				switch (aux) {
				case VDEV_AUX_OPEN_FAILED:
					class = FM_EREPORT_ZFS_DEVICE_OPEN_FAILED;
					break;
				case VDEV_AUX_CORRUPT_DATA:
					class = FM_EREPORT_ZFS_DEVICE_CORRUPT_DATA;
					break;
				case VDEV_AUX_NO_REPLICAS:
					class = FM_EREPORT_ZFS_DEVICE_NO_REPLICAS;
					break;
				case VDEV_AUX_BAD_GUID_SUM:
					class = FM_EREPORT_ZFS_DEVICE_BAD_GUID_SUM;
					break;
				case VDEV_AUX_TOO_SMALL:
					class = FM_EREPORT_ZFS_DEVICE_TOO_SMALL;
					break;
				case VDEV_AUX_BAD_LABEL:
					class = FM_EREPORT_ZFS_DEVICE_BAD_LABEL;
					break;
				default:
					class = FM_EREPORT_ZFS_DEVICE_UNKNOWN;
				}

				zfs_ereport_post(class, spa, vd, NULL, save_state, 0);
			}

			/* Erase any notion of persistent removed state */
			vd->vdev_removed = B_FALSE;
		} else {
			vd->vdev_removed = B_FALSE;
		}

		if (!isopen && vd->vdev_parent)
			vdev_propagate_state(vd->vdev_parent);
	}

	/*
	 * Check the vdev configuration to ensure that it's capable of supporting
	 * a root pool.  Currently, we do not support RAID-Z or partial configuration.
	 * In addition, only a single top-level vdev is allowed and none of the leaves
	 * can be wholedisks.
	 *
	 * Returns B_TRUE only if this vdev and every vdev beneath it pass the checks.
	 */
	boolean_t
	vdev_is_bootable(vdev_t *vd)
	{
		if (!vd->vdev_ops->vdev_op_leaf) {
			char *vdev_type = vd->vdev_ops->vdev_op_type;

			if (strcmp(vdev_type, VDEV_TYPE_ROOT) == 0 &&
			    vd->vdev_children > 1) {
				return (B_FALSE);
			} else if (strcmp(vdev_type, VDEV_TYPE_RAIDZ) == 0 ||
			    strcmp(vdev_type, VDEV_TYPE_MISSING) == 0) {
				return (B_FALSE);
			}
		} else if (vd->vdev_wholedisk == 1) {
			return (B_FALSE);
		}

		for (int c = 0; c < vd->vdev_children; c++) {
			if (!vdev_is_bootable(vd->vdev_child[c]))
				return (B_FALSE);
		}
		return (B_TRUE);
	}

	/*
	 * Load the state from the original vdev tree (ovd) which
	 * we've retrieved from the MOS config object.  If the original
	 * vdev was offline or faulted then we transfer that state to the
	 * device in the current vdev tree (nvd).
	 *
	 * The two trees are walked in parallel by child index, and the guids of
	 * 'nvd' and 'ovd' are asserted to match at every level.
	 */
	void
	vdev_load_log_state(vdev_t *nvd, vdev_t *ovd)
	{
		spa_t *spa = nvd->vdev_spa;

		ASSERT(nvd->vdev_top->vdev_islog);
		ASSERT(spa_config_held(spa, SCL_STATE_ALL, RW_WRITER) == SCL_STATE_ALL);
		ASSERT3U(nvd->vdev_guid, ==, ovd->vdev_guid);

		for (int c = 0; c < nvd->vdev_children; c++)
			vdev_load_log_state(nvd->vdev_child[c], ovd->vdev_child[c]);

		if (nvd->vdev_ops->vdev_op_leaf) {
			/*
			 * Restore the persistent vdev state
			 */
			nvd->vdev_offline = ovd->vdev_offline;
			nvd->vdev_faulted = ovd->vdev_faulted;
			nvd->vdev_degraded = ovd->vdev_degraded;
			nvd->vdev_removed = ovd->vdev_removed;
		}
	}

	/*
	 * Decide whether a log device still holds valid content.  A vdev that was
	 * removed or faulted in the MOS config is known to have had its log
	 * content written to the pool already.
	 *
	 * Returns B_TRUE if this vdev is a leaf that is neither faulted nor
	 * removed, or if that holds for any vdev beneath it.
	 */
	boolean_t
	vdev_log_state_valid(vdev_t *vd)
	{
		if (vd->vdev_ops->vdev_op_leaf) {
			if (!vd->vdev_faulted && !vd->vdev_removed)
				return (B_TRUE);
		}

		for (int c = 0; c < vd->vdev_children; c++) {
			if (vdev_log_state_valid(vd->vdev_child[c]))
				return (B_TRUE);
		}

		return (B_FALSE);
	}

	/*
	 * Expand a top-level vdev if possible.
	 *
	 * Nothing is done unless vdev_asize, shifted down by vdev_ms_shift, exceeds
	 * the current vdev_ms_count.
	 */
	void
	vdev_expand(vdev_t *vd, uint64_t txg)
	{
		ASSERT(vd->vdev_top == vd);
		ASSERT(spa_config_held(vd->vdev_spa, SCL_ALL, RW_WRITER) == SCL_ALL);

		if ((vd->vdev_asize >> vd->vdev_ms_shift) <= vd->vdev_ms_count)
			return;

		VERIFY(vdev_metaslab_init(vd, txg) == 0);
		vdev_config_dirty(vd);
	}

	/*
	 * Split a vdev.
	 *
	 * Removes 'vd' from its parent and compacts the parent's child array.  If
	 * that leaves the parent with exactly one child, vdev_remove_parent() is
	 * called on that child and it is flagged as splitting.  State is then
	 * propagated upward starting from the parent's first child.
	 */
	void
	vdev_split(vdev_t *vd)
	{
		vdev_t *cvd, *pvd = vd->vdev_parent;

		vdev_remove_child(pvd, vd);
		vdev_compact_children(pvd);

		cvd = pvd->vdev_child[0];
		if (pvd->vdev_children == 1) {
			vdev_remove_parent(cvd);
			cvd->vdev_splitting = B_TRUE;
		}
		vdev_propagate_state(cvd);
	}

	/*
	 * Walk the vdev tree rooted at 'vd' and, for every leaf with active I/O,
	 * compare the age of the first entry in its active tree against
	 * spa_deadman_synctime().  If the limit is exceeded, log a debug message
	 * and call fm_panic().
	 */
	void
	vdev_deadman(vdev_t *vd)
	{
		vdev_queue_t *vq;

		for (int c = 0; c < vd->vdev_children; c++)
			vdev_deadman(vd->vdev_child[c]);

		/* Only leaf vdevs have their queue inspected. */
		if (!vd->vdev_ops->vdev_op_leaf)
			return;

		vq = &vd->vdev_queue;

		mutex_enter(&vq->vq_lock);
		if (avl_numnodes(&vq->vq_active_tree) > 0) {
			spa_t *spa = vd->vdev_spa;
			zio_t *first;
			uint64_t elapsed;

			/*
			 * Take the entry at the head of the active tree; if it has
			 * been outstanding for longer than the spa_deadman_synctime
			 * we panic the system.
			 */
			first = avl_first(&vq->vq_active_tree);
			elapsed = gethrtime() - first->io_timestamp;
			if (elapsed > spa_deadman_synctime(spa)) {
				zfs_dbgmsg("SLOW IO: zio timestamp %lluns, "
				    "delta %lluns, last io %lluns",
				    first->io_timestamp, elapsed,
				    vq->vq_io_complete_ts);
				fm_panic("I/O to pool '%s' appears to be "
				    "hung.", spa_name(spa));
			}
		}
		mutex_exit(&vq->vq_lock);
	}
	Index: vendor-sys/illumos/dist/uts/common/fs/zfs/zil.c
	===================================================================
	--- vendor-sys/illumos/dist/uts/common/fs/zfs/zil.c (revision 284761)
	+++ vendor-sys/illumos/dist/uts/common/fs/zfs/zil.c (revision 284762)
	@@ -1,2133 +1,2132 @@
	/*
	* CDDL HEADER START
	*
	* The contents of this file are subject to the terms of the
	* Common Development and Distribution License (the "License").
	* You may not use this file except in compliance with the License.
	*
	* You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
	* or http://www.opensolaris.org/os/licensing.
	* See the License for the specific language governing permissions
	* and limitations under the License.
	*
	* When distributing Covered Code, include this CDDL HEADER in each
	* file and include the License file at usr/src/OPENSOLARIS.LICENSE.
	* If applicable, add the following below this CDDL HEADER, with the
	* fields enclosed by brackets "[]" replaced with your own identifying
	* information: Portions Copyright [yyyy] [name of copyright owner]
	*
	* CDDL HEADER END
	*/
	/*
	* Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
	* Copyright (c) 2011, 2014 by Delphix. All rights reserved.
	*/

	/* Portions Copyright 2010 Robert Milkowski */

	#include <sys/zfs_context.h>
	#include <sys/spa.h>
	#include <sys/dmu.h>
	#include <sys/zap.h>
	#include <sys/arc.h>
	#include <sys/stat.h>
	#include <sys/resource.h>
	#include <sys/zil.h>
	#include <sys/zil_impl.h>
	#include <sys/dsl_dataset.h>
	#include <sys/vdev_impl.h>
	#include <sys/dmu_tx.h>
	#include <sys/dsl_pool.h>

	/*
	* The zfs intent log (ZIL) saves transaction records of system calls
	* that change the file system in memory with enough information
	* to be able to replay them. These are stored in memory until
	* either the DMU transaction group (txg) commits them to the stable pool
	* and they can be discarded, or they are flushed to the stable log
	* (also in the pool) due to a fsync, O_DSYNC or other synchronous
	* requirement. In the event of a panic or power fail then those log
	* records (transactions) are replayed.
	*
	* There is one ZIL per file system. Its on-disk (pool) format consists
	* of 3 parts:
	*
	* - ZIL header
	* - ZIL blocks
	* - ZIL records
	*
	* A log record holds a system call transaction. Log blocks can
	* hold many log records and the blocks are chained together.
	* Each ZIL block contains a block pointer (blkptr_t) to the next
	* ZIL block in the chain. The ZIL header points to the first
	* block in the chain. Note there is not a fixed place in the pool
	* to hold blocks. They are dynamically allocated and freed as
	* needed from the blocks available. Figure X shows the ZIL structure:
	*/

	/*
	* Disable intent logging replay. This global ZIL switch affects all pools.
	*/
	int zil_replay_disable = 0;

	/*
	* Tunable parameter for debugging or performance analysis. Setting
	* zfs_nocacheflush will cause corruption on power loss if a volatile
	* out-of-order write cache is enabled.
	*/
	boolean_t zfs_nocacheflush = B_FALSE;

	/* kmem cache from which lwb_t structures are allocated and freed. */
	static kmem_cache_t *zil_lwb_cache;

	/* Forward declaration; called from zil_itx_assign(). */
	static void zil_async_to_sync(zilog_t *zilog, uint64_t foid);

	/*
	 * True when no log records have been added to lwb: the usable payload of
	 * its block (logical size minus the zil_chain_t) equals the space still
	 * unused.  The argument is parenthesized so any pointer expression works.
	 */
	#define	LWB_EMPTY(lwb) ((BP_GET_LSIZE(&(lwb)->lwb_blk) - \
	    sizeof (zil_chain_t)) == ((lwb)->lwb_sz - (lwb)->lwb_nused))


	/*
	* ziltest is by and large an ugly hack, but very useful in
	* checking replay without tedious work.
	* When running ziltest we want to keep all itx's and so maintain
	* a single list in the zl_itxg[] that uses a high txg: ZILTEST_TXG
	* We subtract TXG_CONCURRENT_STATES to allow for common code.
	*/
	#define ZILTEST_TXG (UINT64_MAX - TXG_CONCURRENT_STATES)

	static int
	zil_bp_compare(const void x1, const void x2)
	{
	const dva_t dva1 = &((zil_bp_node_t )x1)->zn_dva;
	const dva_t dva2 = &((zil_bp_node_t )x2)->zn_dva;

	if (DVA_GET_VDEV(dva1) < DVA_GET_VDEV(dva2))
	return (-1);
	if (DVA_GET_VDEV(dva1) > DVA_GET_VDEV(dva2))
	return (1);

	if (DVA_GET_OFFSET(dva1) < DVA_GET_OFFSET(dva2))
	return (-1);
	if (DVA_GET_OFFSET(dva1) > DVA_GET_OFFSET(dva2))
	return (1);

	return (0);
	}

	/*
	 * Create the zilog's AVL tree of block pointers (zl_bp_tree), ordered by
	 * zil_bp_compare().
	 */
	static void
	zil_bp_tree_init(zilog_t *zilog)
	{
		avl_tree_t *tree = &zilog->zl_bp_tree;
		const size_t node_size = sizeof (zil_bp_node_t);
		const size_t link_off = offsetof(zil_bp_node_t, zn_node);

		avl_create(tree, zil_bp_compare, node_size, link_off);
	}

	/*
	 * Free every node in the zilog's zl_bp_tree and destroy the tree itself.
	 */
	static void
	zil_bp_tree_fini(zilog_t *zilog)
	{
		avl_tree_t *tree = &zilog->zl_bp_tree;
		void *walk = NULL;
		zil_bp_node_t *node;

		for (node = avl_destroy_nodes(tree, &walk); node != NULL;
		    node = avl_destroy_nodes(tree, &walk)) {
			kmem_free(node, sizeof (zil_bp_node_t));
		}

		avl_destroy(tree);
	}

	/*
	 * Remember bp's identity DVA in zl_bp_tree.
	 *
	 * Returns 0 if the DVA was inserted, or if bp is an embedded block
	 * pointer (those are not tracked); returns EEXIST if the DVA is already
	 * in the tree.
	 */
	int
	zil_bp_tree_add(zilog_t *zilog, const blkptr_t *bp)
	{
		avl_tree_t *t = &zilog->zl_bp_tree;
		const dva_t *dva;
		zil_bp_node_t *zn;
		avl_index_t where;

		if (BP_IS_EMBEDDED(bp))
			return (0);

		dva = BP_IDENTITY(bp);

		if (avl_find(t, dva, &where) != NULL)
			return (SET_ERROR(EEXIST));

		zn = kmem_alloc(sizeof (zil_bp_node_t), KM_SLEEP);
		zn->zn_dva = *dva;
		avl_insert(t, zn, where);

		return (0);
	}

	/*
	 * Return zilog->zl_header cast to a plain (writable) zil_header_t pointer.
	 */
	static zil_header_t *
	zil_header_in_syncing_context(zilog_t *zilog)
	{
		zil_header_t *zh = (zil_header_t *)zilog->zl_header;

		return (zh);
	}

	/*
	 * Seed the checksum words of bp for the first block of a log chain:
	 * two random GUID words, the id of the zilog's objset, and an initial
	 * sequence number of 1.
	 */
	static void
	zil_init_log_chain(zilog_t *zilog, blkptr_t *bp)
	{
		zio_cksum_t *zc = &bp->blk_cksum;

		zc->zc_word[ZIL_ZC_GUID_0] = spa_get_random(-1ULL);
		zc->zc_word[ZIL_ZC_GUID_1] = spa_get_random(-1ULL);
		zc->zc_word[ZIL_ZC_OBJSET] = dmu_objset_id(zilog->zl_os);
		zc->zc_word[ZIL_ZC_SEQ] = 1ULL;
	}

	/*
	 * Read a log block and make sure it's valid.
	 *
	 * On success the block's log records are copied into dst, *end is set to
	 * the byte just past the copied data, and *nbp receives the pointer to
	 * the next block in the chain.
	 *
	 * Returns 0 on success, the error from arc_read(), or ECKSUM when the
	 * embedded chain linkage (or, for the old format, the used size) does
	 * not check out.
	 */
	static int
	zil_read_log_block(zilog_t *zilog, const blkptr_t *bp, blkptr_t *nbp, void *dst,
	    char **end)
	{
		enum zio_flag zio_flags = ZIO_FLAG_CANFAIL;
		arc_flags_t aflags = ARC_FLAG_WAIT;
		arc_buf_t *abuf = NULL;
		zbookmark_phys_t zb;
		int error;

		if (zilog->zl_header->zh_claim_txg == 0)
			zio_flags |= ZIO_FLAG_SPECULATIVE | ZIO_FLAG_SCRUB;

		if (!(zilog->zl_header->zh_flags & ZIL_CLAIM_LR_SEQ_VALID))
			zio_flags |= ZIO_FLAG_SPECULATIVE;

		SET_BOOKMARK(&zb, bp->blk_cksum.zc_word[ZIL_ZC_OBJSET],
		    ZB_ZIL_OBJECT, ZB_ZIL_LEVEL, bp->blk_cksum.zc_word[ZIL_ZC_SEQ]);

		error = arc_read(NULL, zilog->zl_spa, bp, arc_getbuf_func, &abuf,
		    ZIO_PRIORITY_SYNC_READ, zio_flags, &aflags, &zb);

		if (error == 0) {
			zio_cksum_t cksum = bp->blk_cksum;

			/*
			 * Validate the checksummed log block.
			 *
			 * Sequence numbers should be... sequential.  The checksum
			 * verifier for the next block should be bp's checksum plus 1.
			 *
			 * Also check the log chain linkage and size used.
			 */
			cksum.zc_word[ZIL_ZC_SEQ]++;

			if (BP_GET_CHECKSUM(bp) == ZIO_CHECKSUM_ZILOG2) {
				/* Chain header leads the block; records follow it. */
				zil_chain_t *zilc = abuf->b_data;
				char *lr = (char *)(zilc + 1);
				uint64_t len = zilc->zc_nused - sizeof (zil_chain_t);

				if (bcmp(&cksum, &zilc->zc_next_blk.blk_cksum,
				    sizeof (cksum)) || BP_IS_HOLE(&zilc->zc_next_blk)) {
					error = SET_ERROR(ECKSUM);
				} else {
					ASSERT3U(len, <=, SPA_OLD_MAXBLOCKSIZE);
					bcopy(lr, dst, len);
					*end = (char *)dst + len;
					*nbp = zilc->zc_next_blk;
				}
			} else {
				/* Chain trailer sits in the last bytes of the block. */
				char *lr = abuf->b_data;
				uint64_t size = BP_GET_LSIZE(bp);
				zil_chain_t *zilc = (zil_chain_t *)(lr + size) - 1;

				if (bcmp(&cksum, &zilc->zc_next_blk.blk_cksum,
				    sizeof (cksum)) || BP_IS_HOLE(&zilc->zc_next_blk) ||
				    (zilc->zc_nused > (size - sizeof (*zilc)))) {
					error = SET_ERROR(ECKSUM);
				} else {
					ASSERT3U(zilc->zc_nused, <=,
					    SPA_OLD_MAXBLOCKSIZE);
					bcopy(lr, dst, zilc->zc_nused);
					*end = (char *)dst + zilc->zc_nused;
					*nbp = zilc->zc_next_blk;
				}
			}

			VERIFY(arc_buf_remove_ref(abuf, &abuf));
		}

		return (error);
	}

	/*
	 * Read a TX_WRITE log data block.
	 *
	 * Reads the block referenced by lr->lr_blkptr.  When wbuf is non-NULL the
	 * data is copied into it (or wbuf is zero-filled if the block pointer is
	 * a hole); with wbuf == NULL the read only checks that the block is
	 * readable.  Returns 0 or the error from arc_read().
	 */
	static int
	zil_read_log_data(zilog_t *zilog, const lr_write_t *lr, void *wbuf)
	{
		enum zio_flag zio_flags = ZIO_FLAG_CANFAIL;
		const blkptr_t *bp = &lr->lr_blkptr;
		arc_flags_t aflags = ARC_FLAG_WAIT;
		arc_buf_t *abuf = NULL;
		zbookmark_phys_t zb;
		int error;

		if (BP_IS_HOLE(bp)) {
			if (wbuf != NULL)
				bzero(wbuf, MAX(BP_GET_LSIZE(bp), lr->lr_length));
			return (0);
		}

		if (zilog->zl_header->zh_claim_txg == 0)
			zio_flags |= ZIO_FLAG_SPECULATIVE | ZIO_FLAG_SCRUB;

		SET_BOOKMARK(&zb, dmu_objset_id(zilog->zl_os), lr->lr_foid,
		    ZB_ZIL_LEVEL, lr->lr_offset / BP_GET_LSIZE(bp));

		error = arc_read(NULL, zilog->zl_spa, bp, arc_getbuf_func, &abuf,
		    ZIO_PRIORITY_SYNC_READ, zio_flags, &aflags, &zb);

		if (error == 0) {
			if (wbuf != NULL)
				bcopy(abuf->b_data, wbuf, arc_buf_size(abuf));
			(void) arc_buf_remove_ref(abuf, &abuf);
		}

		return (error);
	}

	/*
	 * Parse the intent log, and call parse_func for each valid record within.
	 *
	 * parse_blk_func is invoked for every block pointer in the chain and
	 * parse_lr_func for every record in each block that reads back valid;
	 * both receive arg and txg unchanged.  The walk stops at the first error
	 * and, for a claimed log, once the claimed block/record sequence numbers
	 * are reached.
	 *
	 * The outcome (error, highest block and record sequence numbers seen,
	 * block and record counts) is stored in the zl_parse_* fields of zilog.
	 * Returns the last error, or 0.
	 */
	int
	zil_parse(zilog_t *zilog, zil_parse_blk_func_t *parse_blk_func,
	    zil_parse_lr_func_t *parse_lr_func, void *arg, uint64_t txg)
	{
		const zil_header_t *zh = zilog->zl_header;
		boolean_t claimed = !!zh->zh_claim_txg;
		uint64_t claim_blk_seq = claimed ? zh->zh_claim_blk_seq : UINT64_MAX;
		uint64_t claim_lr_seq = claimed ? zh->zh_claim_lr_seq : UINT64_MAX;
		uint64_t max_blk_seq = 0;
		uint64_t max_lr_seq = 0;
		uint64_t blk_count = 0;
		uint64_t lr_count = 0;
		blkptr_t blk, next_blk;
		char *lrbuf, *lrp;
		int error = 0;

		/*
		 * Old logs didn't record the maximum zh_claim_lr_seq.
		 */
		if (!(zh->zh_flags & ZIL_CLAIM_LR_SEQ_VALID))
			claim_lr_seq = UINT64_MAX;

		/*
		 * Starting at the block pointed to by zh_log we read the log chain.
		 * For each block in the chain we strongly check that block to
		 * ensure its validity.  We stop when an invalid block is found.
		 * For each block pointer in the chain we call parse_blk_func().
		 * For each record in each valid block we call parse_lr_func().
		 * If the log has been claimed, stop if we encounter a sequence
		 * number greater than the highest claimed sequence number.
		 */
		lrbuf = zio_buf_alloc(SPA_OLD_MAXBLOCKSIZE);
		zil_bp_tree_init(zilog);

		for (blk = zh->zh_log; !BP_IS_HOLE(&blk); blk = next_blk) {
			uint64_t blk_seq = blk.blk_cksum.zc_word[ZIL_ZC_SEQ];
			int reclen;
			char *end;

			if (blk_seq > claim_blk_seq)
				break;
			if ((error = parse_blk_func(zilog, &blk, arg, txg)) != 0)
				break;
			ASSERT3U(max_blk_seq, <, blk_seq);
			max_blk_seq = blk_seq;
			blk_count++;

			if (max_lr_seq == claim_lr_seq && max_blk_seq == claim_blk_seq)
				break;

			error = zil_read_log_block(zilog, &blk, &next_blk, lrbuf, &end);
			if (error != 0)
				break;

			/* Records are packed back to back; step by each one's length. */
			for (lrp = lrbuf; lrp < end; lrp += reclen) {
				lr_t *lr = (lr_t *)lrp;
				reclen = lr->lrc_reclen;
				ASSERT3U(reclen, >=, sizeof (lr_t));
				if (lr->lrc_seq > claim_lr_seq)
					goto done;
				if ((error = parse_lr_func(zilog, lr, arg, txg)) != 0)
					goto done;
				ASSERT3U(max_lr_seq, <, lr->lrc_seq);
				max_lr_seq = lr->lrc_seq;
				lr_count++;
			}
		}
	done:
		zilog->zl_parse_error = error;
		zilog->zl_parse_blk_seq = max_blk_seq;
		zilog->zl_parse_lr_seq = max_lr_seq;
		zilog->zl_parse_blk_count = blk_count;
		zilog->zl_parse_lr_count = lr_count;

		ASSERT(!claimed || !(zh->zh_flags & ZIL_CLAIM_LR_SEQ_VALID) ||
		    (max_blk_seq == claim_blk_seq && max_lr_seq == claim_lr_seq));

		zil_bp_tree_fini(zilog);
		zio_buf_free(lrbuf, SPA_OLD_MAXBLOCKSIZE);

		return (error);
	}

	/*
	 * zil_parse() block callback used when claiming (or checking) a log.
	 * Returns 0 when there is nothing to claim, otherwise the result of
	 * waiting on the zio_claim() of bp.
	 */
	static int
	zil_claim_log_block(zilog_t *zilog, blkptr_t *bp, void *tx, uint64_t first_txg)
	{
		/*
		 * Claim log block if not already committed and not already claimed.
		 * If tx == NULL, just verify that the block is claimable.
		 */
		if (BP_IS_HOLE(bp) || bp->blk_birth < first_txg ||
		    zil_bp_tree_add(zilog, bp) != 0)
			return (0);

		return (zio_wait(zio_claim(NULL, zilog->zl_spa,
		    tx == NULL ? 0 : first_txg, bp, spa_claim_notify, NULL,
		    ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE | ZIO_FLAG_SCRUB)));
	}

	/*
	 * zil_parse() record callback used when claiming (or checking) a log.
	 * Only TX_WRITE records are of interest: the data block they point to is
	 * test-read and then claimed via zil_claim_log_block().
	 *
	 * Returns 0, a read error from zil_read_log_data(), or the claim result.
	 */
	static int
	zil_claim_log_record(zilog_t *zilog, lr_t *lrc, void *tx, uint64_t first_txg)
	{
		lr_write_t *lr = (lr_write_t *)lrc;
		int error;

		if (lrc->lrc_txtype != TX_WRITE)
			return (0);

		/*
		 * If the block is not readable, don't claim it.  This can happen
		 * in normal operation when a log block is written to disk before
		 * some of the dmu_sync() blocks it points to.  In this case, the
		 * transaction cannot have been committed to anyone (we would have
		 * waited for all writes to be stable first), so it is semantically
		 * correct to declare this the end of the log.
		 */
		if (lr->lr_blkptr.blk_birth >= first_txg &&
		    (error = zil_read_log_data(zilog, lr, NULL)) != 0)
			return (error);
		return (zil_claim_log_block(zilog, &lr->lr_blkptr, tx, first_txg));
	}

	/*
	 * zil_parse() block callback used when destroying a log: free log block
	 * bp in the txg of tx.  claim_txg is unused.  Always returns 0.
	 */
	/* ARGSUSED */
	static int
	zil_free_log_block(zilog_t *zilog, blkptr_t *bp, void *tx, uint64_t claim_txg)
	{
		zio_free_zil(zilog->zl_spa, dmu_tx_get_txg(tx), bp);

		return (0);
	}

	/*
	 * zil_parse() record callback used when destroying a log: free the data
	 * block of a TX_WRITE record if it was born at or after claim_txg and
	 * has not already been seen during this parse.  Always returns 0.
	 */
	static int
	zil_free_log_record(zilog_t *zilog, lr_t *lrc, void *tx, uint64_t claim_txg)
	{
		lr_write_t *lr = (lr_write_t *)lrc;
		blkptr_t *bp = &lr->lr_blkptr;

		/*
		 * If we previously claimed it, we need to free it.
		 */
		if (claim_txg != 0 && lrc->lrc_txtype == TX_WRITE &&
		    bp->blk_birth >= claim_txg && zil_bp_tree_add(zilog, bp) == 0 &&
		    !BP_IS_HOLE(bp))
			zio_free(zilog->zl_spa, dmu_tx_get_txg(tx), bp);

		return (0);
	}

	/*
	 * Allocate and initialize a log write buffer (lwb) for the log block
	 * described by bp, and append it to zilog's zl_lwb_list under zl_lock.
	 *
	 * For ZILOG2-checksummed blocks the chain header lives at the start of
	 * the buffer, so lwb_nused starts past it and lwb_sz is the whole block;
	 * otherwise lwb_sz excludes room for the zil_chain_t.
	 */
	static lwb_t *
	zil_alloc_lwb(zilog_t *zilog, blkptr_t *bp, uint64_t txg)
	{
		lwb_t *lwb;

		lwb = kmem_cache_alloc(zil_lwb_cache, KM_SLEEP);
		lwb->lwb_zilog = zilog;
		lwb->lwb_blk = *bp;
		lwb->lwb_buf = zio_buf_alloc(BP_GET_LSIZE(bp));
		lwb->lwb_max_txg = txg;
		lwb->lwb_zio = NULL;
		lwb->lwb_tx = NULL;
		if (BP_GET_CHECKSUM(bp) == ZIO_CHECKSUM_ZILOG2) {
			lwb->lwb_nused = sizeof (zil_chain_t);
			lwb->lwb_sz = BP_GET_LSIZE(bp);
		} else {
			lwb->lwb_nused = 0;
			lwb->lwb_sz = BP_GET_LSIZE(bp) - sizeof (zil_chain_t);
		}

		mutex_enter(&zilog->zl_lock);
		list_insert_tail(&zilog->zl_lwb_list, lwb);
		mutex_exit(&zilog->zl_lock);

		return (lwb);
	}

	/*
	 * Register zilog on the pool's dirty-zilog list for txg.  Invoked when
	 * in-memory log transactions are created, so that the itxs get cleaned
	 * up at the end of spa_sync().  Panics if the dataset is a snapshot.
	 */
	void
	zilog_dirty(zilog_t *zilog, uint64_t txg)
	{
		dsl_dataset_t *dataset = dmu_objset_ds(zilog->zl_os);
		dsl_pool_t *pool = zilog->zl_dmu_pool;

		if (dataset->ds_is_snapshot)
			panic("dirtying snapshot!");

		if (!txg_list_add(&pool->dp_dirty_zilogs, zilog, txg))
			return;

		/* newly listed: hold the dataset's dbuf until we are written out */
		dmu_buf_add_ref(dataset->ds_dbuf, zilog);
	}

	/*
	 * Report whether zilog is on the pool's dirty-zilog list for any of the
	 * TXG_SIZE txg slots.
	 */
	boolean_t
	zilog_is_dirty(zilog_t *zilog)
	{
		dsl_pool_t *pool = zilog->zl_dmu_pool;
		boolean_t dirty = B_FALSE;
		int slot = 0;

		while (!dirty && slot < TXG_SIZE) {
			if (txg_list_member(&pool->dp_dirty_zilogs, zilog, slot))
				dirty = B_TRUE;
			slot++;
		}
		return (dirty);
	}

	/*
	 * Create an on-disk intent log.
	 *
	 * Makes sure a usable first log block exists (allocating one in a new
	 * tx if needed) and returns a log write buffer for it, or NULL if the
	 * block allocation failed.
	 */
	static lwb_t *
	zil_create(zilog_t *zilog)
	{
		const zil_header_t *zh = zilog->zl_header;
		lwb_t *lwb = NULL;
		uint64_t txg = 0;
		dmu_tx_t *tx = NULL;
		blkptr_t blk;
		int error = 0;

		/*
		 * Wait for any previous destroy to complete.
		 */
		txg_wait_synced(zilog->zl_dmu_pool, zilog->zl_destroy_txg);

		ASSERT(zh->zh_claim_txg == 0);
		ASSERT(zh->zh_replay_seq == 0);

		blk = zh->zh_log;

		/*
		 * Allocate an initial log block if:
		 *    - there isn't one already
		 *    - the existing block is the wrong endianness
		 */
		if (BP_IS_HOLE(&blk) || BP_SHOULD_BYTESWAP(&blk)) {
			tx = dmu_tx_create(zilog->zl_os);
			VERIFY(dmu_tx_assign(tx, TXG_WAIT) == 0);
			dsl_dataset_dirty(dmu_objset_ds(zilog->zl_os), tx);
			txg = dmu_tx_get_txg(tx);

			if (!BP_IS_HOLE(&blk)) {
				zio_free_zil(zilog->zl_spa, txg, &blk);
				BP_ZERO(&blk);
			}

			error = zio_alloc_zil(zilog->zl_spa, txg, &blk, NULL,
			    ZIL_MIN_BLKSZ, zilog->zl_logbias == ZFS_LOGBIAS_LATENCY);

			if (error == 0)
				zil_init_log_chain(zilog, &blk);
		}

		/*
		 * Allocate a log write buffer (lwb) for the first log block.
		 */
		if (error == 0)
			lwb = zil_alloc_lwb(zilog, &blk, txg);

		/*
		 * If we just allocated the first log block, commit our transaction
		 * and wait for zil_sync() to stuff the block pointer into zh_log.
		 * (zh is part of the MOS, so we cannot modify it in open context.)
		 */
		if (tx != NULL) {
			dmu_tx_commit(tx);
			txg_wait_synced(zilog->zl_dmu_pool, txg);
		}

		ASSERT(bcmp(&blk, &zh->zh_log, sizeof (blk)) == 0);

		return (lwb);
	}

	/*
	 * In one tx, free all log blocks and clear the log header.
	 * If keep_first is set, then we're replaying a log with no content.
	 * We want to keep the first block, however, so that the first
	 * synchronous transaction doesn't require a txg_wait_synced()
	 * in zil_create().  We don't need to txg_wait_synced() here either
	 * when keep_first is set, because both zil_create() and zil_destroy()
	 * will wait for any in-progress destroys to complete.
	 */
	void
	zil_destroy(zilog_t *zilog, boolean_t keep_first)
	{
		const zil_header_t *zh = zilog->zl_header;
		lwb_t *lwb;
		dmu_tx_t *tx;
		uint64_t txg;

		/*
		 * Wait for any previous destroy to complete.
		 */
		txg_wait_synced(zilog->zl_dmu_pool, zilog->zl_destroy_txg);

		/* snapshot the header contents before anything is torn down */
		zilog->zl_old_header = *zh;		/* debugging aid */

		if (BP_IS_HOLE(&zh->zh_log))
			return;

		tx = dmu_tx_create(zilog->zl_os);
		VERIFY(dmu_tx_assign(tx, TXG_WAIT) == 0);
		dsl_dataset_dirty(dmu_objset_ds(zilog->zl_os), tx);
		txg = dmu_tx_get_txg(tx);

		mutex_enter(&zilog->zl_lock);

		ASSERT3U(zilog->zl_destroy_txg, <, txg);
		zilog->zl_destroy_txg = txg;
		zilog->zl_keep_first = keep_first;

		if (!list_is_empty(&zilog->zl_lwb_list)) {
			/* in-memory lwbs exist: free each buffer and its block */
			ASSERT(zh->zh_claim_txg == 0);
			VERIFY(!keep_first);
			while ((lwb = list_head(&zilog->zl_lwb_list)) != NULL) {
				list_remove(&zilog->zl_lwb_list, lwb);
				if (lwb->lwb_buf != NULL)
					zio_buf_free(lwb->lwb_buf, lwb->lwb_sz);
				zio_free_zil(zilog->zl_spa, txg, &lwb->lwb_blk);
				kmem_cache_free(zil_lwb_cache, lwb);
			}
		} else if (!keep_first) {
			zil_destroy_sync(zilog, tx);
		}
		mutex_exit(&zilog->zl_lock);

		dmu_tx_commit(tx);
	}

	/*
	 * Walk the on-disk log with zil_parse(), freeing every log block and any
	 * claimed TX_WRITE data block in the txg of tx.  The zilog must have no
	 * in-memory lwbs.  The zil_parse() result is deliberately ignored.
	 */
	void
	zil_destroy_sync(zilog_t *zilog, dmu_tx_t *tx)
	{
		ASSERT(list_is_empty(&zilog->zl_lwb_list));
		(void) zil_parse(zilog, zil_free_log_block,
		    zil_free_log_record, tx, zilog->zl_header->zh_claim_txg);
	}

	/*
	 * Claim the intent log of one dataset in the txg of txarg.  Always
	 * returns 0; a failure to own the objset is only warned about (and not
	 * even that for EBUSY).
	 */
	int
	-zil_claim(const char *osname, void *txarg)
	+zil_claim(dsl_pool_t *dp, dsl_dataset_t *ds, void *txarg)
	{
		dmu_tx_t *tx = txarg;
		uint64_t first_txg = dmu_tx_get_txg(tx);
		zilog_t *zilog;
		zil_header_t *zh;
		objset_t *os;
		int error;

	-	error = dmu_objset_own(osname, DMU_OST_ANY, B_FALSE, FTAG, &os);
	+	error = dmu_objset_own_obj(dp, ds->ds_object,
	+	    DMU_OST_ANY, B_FALSE, FTAG, &os);
		if (error != 0) {
			/*
			 * EBUSY indicates that the objset is inconsistent, in which
			 * case it can not have a ZIL.
			 */
			if (error != EBUSY) {
	-			cmn_err(CE_WARN, "can't open objset for %s, error %u",
	-			    osname, error);
	+			cmn_err(CE_WARN, "can't open objset for %llu, error %u",
	+			    (unsigned long long)ds->ds_object, error);
			}
			return (0);
		}

		zilog = dmu_objset_zil(os);
		zh = zil_header_in_syncing_context(zilog);

		/* Log state is "clear": drop the log instead of claiming it. */
		if (spa_get_log_state(zilog->zl_spa) == SPA_LOG_CLEAR) {
			if (!BP_IS_HOLE(&zh->zh_log))
				zio_free_zil(zilog->zl_spa, first_txg, &zh->zh_log);
			BP_ZERO(&zh->zh_log);
			dsl_dataset_dirty(dmu_objset_ds(os), tx);
			dmu_objset_disown(os, FTAG);
			return (0);
		}

		/*
		 * Claim all log blocks if we haven't already done so, and remember
		 * the highest claimed sequence number.  This ensures that if we can
		 * read only part of the log now (e.g. due to a missing device),
		 * but we can read the entire log later, we will not try to replay
		 * or destroy beyond the last block we successfully claimed.
		 */
		ASSERT3U(zh->zh_claim_txg, <=, first_txg);
		if (zh->zh_claim_txg == 0 && !BP_IS_HOLE(&zh->zh_log)) {
			(void) zil_parse(zilog, zil_claim_log_block,
			    zil_claim_log_record, tx, first_txg);
			zh->zh_claim_txg = first_txg;
			zh->zh_claim_blk_seq = zilog->zl_parse_blk_seq;
			zh->zh_claim_lr_seq = zilog->zl_parse_lr_seq;
			if (zilog->zl_parse_lr_count || zilog->zl_parse_blk_count > 1)
				zh->zh_flags |= ZIL_REPLAY_NEEDED;
			zh->zh_flags |= ZIL_CLAIM_LR_SEQ_VALID;
			dsl_dataset_dirty(dmu_objset_ds(os), tx);
		}

		ASSERT3U(first_txg, ==, (spa_last_synced_txg(zilog->zl_spa) + 1));
		dmu_objset_disown(os, FTAG);
		return (0);
	}

	/*
	 * Check the log by walking the log chain.
	 * Checksum errors are ok as they indicate the end of the chain.
	 * Any other error (no device or read failure) returns an error.
	 *
	 * tx must be NULL.  ECKSUM and ENOENT from the walk are reported as 0,
	 * as is a failure to obtain the objset (which is only warned about).
	 */
	+/* ARGSUSED */
	int
	-zil_check_log_chain(const char *osname, void *tx)
	+zil_check_log_chain(dsl_pool_t *dp, dsl_dataset_t *ds, void *tx)
	{
		zilog_t *zilog;
		objset_t *os;
		blkptr_t *bp;
		int error;

		ASSERT(tx == NULL);

	-	error = dmu_objset_hold(osname, FTAG, &os);
	+	error = dmu_objset_from_ds(ds, &os);
		if (error != 0) {
	-		cmn_err(CE_WARN, "can't open objset for %s", osname);
	+		cmn_err(CE_WARN, "can't open objset %llu, error %d",
	+		    (unsigned long long)ds->ds_object, error);
			return (0);
		}

		zilog = dmu_objset_zil(os);
		bp = (blkptr_t *)&zilog->zl_header->zh_log;

		/*
		 * Check the first block and determine if it's on a log device
		 * which may have been removed or faulted prior to loading this
		 * pool.  If so, there's no point in checking the rest of the log
		 * as its content should have already been synced to the pool.
		 */
		if (!BP_IS_HOLE(bp)) {
			vdev_t *vd;
			boolean_t valid = B_TRUE;

			spa_config_enter(os->os_spa, SCL_STATE, FTAG, RW_READER);
			vd = vdev_lookup_top(os->os_spa, DVA_GET_VDEV(&bp->blk_dva[0]));
			if (vd->vdev_islog && vdev_is_dead(vd))
				valid = vdev_log_state_valid(vd);
			spa_config_exit(os->os_spa, SCL_STATE, FTAG);

	-		if (!valid) {
	-			dmu_objset_rele(os, FTAG);
	+		if (!valid)
				return (0);
	-		}
		}

		/*
		 * Because tx == NULL, zil_claim_log_block() will not actually claim
		 * any blocks, but just determine whether it is possible to do so.
		 * In addition to checking the log chain, zil_claim_log_block()
		 * will invoke zio_claim() with a done func of spa_claim_notify(),
		 * which will update spa_max_claim_txg.  See spa_load() for details.
		 */
		error = zil_parse(zilog, zil_claim_log_block, zil_claim_log_record, tx,
		    zilog->zl_header->zh_claim_txg ? -1ULL : spa_first_txg(os->os_spa));
	-
	-	dmu_objset_rele(os, FTAG);

		return ((error == ECKSUM || error == ENOENT) ? 0 : error);
	}

	/*
	 * Three-way comparator over zil_vdev_node_t entries, ordering them by
	 * vdev id (zv_vdev).  Returns -1, 0 or 1.
	 */
	static int
	zil_vdev_compare(const void *x1, const void *x2)
	{
		const uint64_t v1 = ((zil_vdev_node_t *)x1)->zv_vdev;
		const uint64_t v2 = ((zil_vdev_node_t *)x2)->zv_vdev;

		if (v1 < v2)
			return (-1);
		if (v1 > v2)
			return (1);

		return (0);
	}

	/*
	 * Record the vdev of every DVA in bp in zl_vdev_tree so those vdevs can
	 * be flushed later (see zil_flush_vdevs()).  Does nothing when
	 * zfs_nocacheflush is set.  The caller must be the zl_writer.
	 */
	void
	zil_add_block(zilog_t *zilog, const blkptr_t *bp)
	{
		avl_tree_t *t = &zilog->zl_vdev_tree;
		avl_index_t where;
		zil_vdev_node_t *zv, zvsearch;
		int ndvas = BP_GET_NDVAS(bp);
		int i;

		if (zfs_nocacheflush)
			return;

		ASSERT(zilog->zl_writer);

		/*
		 * Even though we're zl_writer, we still need a lock because the
		 * zl_get_data() callbacks may have dmu_sync() done callbacks
		 * that will run concurrently.
		 */
		mutex_enter(&zilog->zl_vdev_lock);
		for (i = 0; i < ndvas; i++) {
			zvsearch.zv_vdev = DVA_GET_VDEV(&bp->blk_dva[i]);
			/* insert each vdev id at most once */
			if (avl_find(t, &zvsearch, &where) == NULL) {
				zv = kmem_alloc(sizeof (*zv), KM_SLEEP);
				zv->zv_vdev = zvsearch.zv_vdev;
				avl_insert(t, zv, where);
			}
		}
		mutex_exit(&zilog->zl_vdev_lock);
	}

	/*
	 * Issue a cache flush to every vdev recorded in zl_vdev_tree, emptying
	 * the tree as it goes, and wait for all the flushes to finish.
	 */
	static void
	zil_flush_vdevs(zilog_t *zilog)
	{
		avl_tree_t *vdev_tree = &zilog->zl_vdev_tree;
		spa_t *pool = zilog->zl_spa;
		zil_vdev_node_t *node;
		void *walk = NULL;
		zio_t *root;

		ASSERT(zilog->zl_writer);

		/*
		 * zl_vdev_lock is not needed: we are the zl_writer and every
		 * zl_get_data() callback has completed.
		 */
		if (avl_numnodes(vdev_tree) == 0)
			return;

		spa_config_enter(pool, SCL_STATE, FTAG, RW_READER);

		root = zio_root(pool, NULL, NULL, ZIO_FLAG_CANFAIL);

		for (node = avl_destroy_nodes(vdev_tree, &walk); node != NULL;
		    node = avl_destroy_nodes(vdev_tree, &walk)) {
			vdev_t *top = vdev_lookup_top(pool, node->zv_vdev);

			if (top != NULL)
				zio_flush(root, top);
			kmem_free(node, sizeof (*node));
		}

		/*
		 * A failed flush is tolerated, since some devices do not
		 * implement the DKIOCFLUSHWRITECACHE ioctl at all.
		 */
		(void) zio_wait(root);

		spa_config_exit(pool, SCL_STATE, FTAG);
	}

	/*
	 * Function called when a log block write completes.
	 *
	 * zio->io_private is the lwb that was written.  Frees the lwb's data
	 * buffer, clears lwb_buf and lwb_tx under zl_lock, and then commits the
	 * tx that was stored in lwb_tx.
	 */
	static void
	zil_lwb_write_done(zio_t *zio)
	{
	lwb_t *lwb = zio->io_private;
	zilog_t *zilog = lwb->lwb_zilog;
	/* saved now because lwb_tx is cleared below before the commit */
	dmu_tx_t *tx = lwb->lwb_tx;

	ASSERT(BP_GET_COMPRESS(zio->io_bp) == ZIO_COMPRESS_OFF);
	ASSERT(BP_GET_TYPE(zio->io_bp) == DMU_OT_INTENT_LOG);
	ASSERT(BP_GET_LEVEL(zio->io_bp) == 0);
	ASSERT(BP_GET_BYTEORDER(zio->io_bp) == ZFS_HOST_BYTEORDER);
	ASSERT(!BP_IS_GANG(zio->io_bp));
	ASSERT(!BP_IS_HOLE(zio->io_bp));
	ASSERT(BP_GET_FILL(zio->io_bp) == 0);

	/*
	 * Ensure the lwb buffer pointer is cleared before releasing
	 * the txg. If we have had an allocation failure and
	 * the txg is waiting to sync then we want zil_sync()
	 * to remove the lwb so that it's not picked up as the next new
	 * one in zil_commit_writer(). zil_sync() will only remove
	 * the lwb if lwb_buf is null.
	 */
	zio_buf_free(lwb->lwb_buf, lwb->lwb_sz);
	mutex_enter(&zilog->zl_lock);
	lwb->lwb_buf = NULL;
	lwb->lwb_tx = NULL;
	mutex_exit(&zilog->zl_lock);

	/*
	 * Now that we've written this log block, we have a stable pointer
	 * to the next block in the chain, so it's OK to let the txg in
	 * which we allocated the next block sync.
	 */
	dmu_tx_commit(tx);
	}

	/*
	 * Initialize the io for a log block.
	 *
	 * Creates the zilog's root zio and the lwb's rewrite zio if they do not
	 * exist yet; calling this again on an initialized lwb changes nothing.
	 */
	static void
	zil_lwb_write_init(zilog_t *zilog, lwb_t *lwb)
	{
		zbookmark_phys_t zb;

		SET_BOOKMARK(&zb, lwb->lwb_blk.blk_cksum.zc_word[ZIL_ZC_OBJSET],
		    ZB_ZIL_OBJECT, ZB_ZIL_LEVEL,
		    lwb->lwb_blk.blk_cksum.zc_word[ZIL_ZC_SEQ]);

		if (zilog->zl_root_zio == NULL) {
			zilog->zl_root_zio = zio_root(zilog->zl_spa, NULL, NULL,
			    ZIO_FLAG_CANFAIL);
		}
		if (lwb->lwb_zio == NULL) {
			lwb->lwb_zio = zio_rewrite(zilog->zl_root_zio, zilog->zl_spa,
			    0, &lwb->lwb_blk, lwb->lwb_buf, BP_GET_LSIZE(&lwb->lwb_blk),
			    zil_lwb_write_done, lwb, ZIO_PRIORITY_SYNC_WRITE,
			    ZIO_FLAG_CANFAIL | ZIO_FLAG_DONT_PROPAGATE, &zb);
		}
	}

	/*
	 * Define a limited set of intent log block sizes.
	 *
	 * These must be a multiple of 4KB.  Note only the amount used (again
	 * aligned to 4KB) actually gets written.  However, we can't always just
	 * allocate SPA_OLD_MAXBLOCKSIZE as the slog space could be exhausted.
	 *
	 * The UINT64_MAX entry is a sentinel that ends the bucket search.
	 */
	uint64_t zil_block_buckets[] = {
		4096,			/* non TX_WRITE */
		8192+4096,		/* data base */
		32*1024 + 4096,		/* NFS writes */
		UINT64_MAX
	};

	/*
	 * Use the slog as long as the logbias is 'latency' and the current commit size
	 * is less than the limit or the total list size is less than 2X the limit.
	 * Limit checking is disabled by setting zil_slog_limit to UINT64_MAX.
	 */
	uint64_t zil_slog_limit = 1024 * 1024;
	#define	USE_SLOG(zilog) (((zilog)->zl_logbias == ZFS_LOGBIAS_LATENCY) && \
	    (((zilog)->zl_cur_used < zil_slog_limit) || \
	    ((zilog)->zl_itx_list_sz < (zil_slog_limit << 1))))

	/*
	 * Start a log block write and advance to the next log block.
	 * Calls are serialized.
	 *
	 * Allocates the next log block, links it into lwb's chain record, and
	 * issues lwb's write.  Returns the lwb for the newly allocated block, or
	 * NULL if the allocation failed (lwb is still written in that case).
	 */
	static lwb_t *
	zil_lwb_write_start(zilog_t *zilog, lwb_t *lwb)
	{
		lwb_t *nlwb = NULL;
		zil_chain_t *zilc;
		spa_t *spa = zilog->zl_spa;
		blkptr_t *bp;
		dmu_tx_t *tx;
		uint64_t txg;
		uint64_t zil_blksz, wsz;
		int i, error;

		/* The chain record leads a ZILOG2 block and trails an old one. */
		if (BP_GET_CHECKSUM(&lwb->lwb_blk) == ZIO_CHECKSUM_ZILOG2) {
			zilc = (zil_chain_t *)lwb->lwb_buf;
			bp = &zilc->zc_next_blk;
		} else {
			zilc = (zil_chain_t *)(lwb->lwb_buf + lwb->lwb_sz);
			bp = &zilc->zc_next_blk;
		}

		ASSERT(lwb->lwb_nused <= lwb->lwb_sz);

		/*
		 * Allocate the next block and save its address in this block
		 * before writing it in order to establish the log chain.
		 * Note that if the allocation of nlwb synced before we wrote
		 * the block that points at it (lwb), we'd leak it if we crashed.
		 * Therefore, we don't do dmu_tx_commit() until zil_lwb_write_done().
		 * We dirty the dataset to ensure that zil_sync() will be called
		 * to clean up in the event of allocation failure or I/O failure.
		 */
		tx = dmu_tx_create(zilog->zl_os);
		VERIFY(dmu_tx_assign(tx, TXG_WAIT) == 0);
		dsl_dataset_dirty(dmu_objset_ds(zilog->zl_os), tx);
		txg = dmu_tx_get_txg(tx);

		lwb->lwb_tx = tx;

		/*
		 * Log blocks are pre-allocated.  Here we select the size of the next
		 * block, based on size used in the last block.
		 * - first find the smallest bucket that will fit the block from a
		 *   limited set of block sizes.  This is because it's faster to write
		 *   blocks allocated from the same metaslab as they are adjacent or
		 *   close.
		 * - next find the maximum from the new suggested size and an array of
		 *   previous sizes.  This lessens a picket fence effect of wrongly
		 *   guessing the size if we have a stream of say 2k, 64k, 2k, 64k
		 *   requests.
		 *
		 * Note we only write what is used, but we can't just allocate
		 * the maximum block size because we can exhaust the available
		 * pool log space.
		 */
		zil_blksz = zilog->zl_cur_used + sizeof (zil_chain_t);
		for (i = 0; zil_blksz > zil_block_buckets[i]; i++)
			continue;
		zil_blksz = zil_block_buckets[i];
		if (zil_blksz == UINT64_MAX)
			zil_blksz = SPA_OLD_MAXBLOCKSIZE;
		zilog->zl_prev_blks[zilog->zl_prev_rotor] = zil_blksz;
		for (i = 0; i < ZIL_PREV_BLKS; i++)
			zil_blksz = MAX(zil_blksz, zilog->zl_prev_blks[i]);
		zilog->zl_prev_rotor = (zilog->zl_prev_rotor + 1) & (ZIL_PREV_BLKS - 1);

		BP_ZERO(bp);
		/* pass the old blkptr in order to spread log blocks across devs */
		error = zio_alloc_zil(spa, txg, bp, &lwb->lwb_blk, zil_blksz,
		    USE_SLOG(zilog));
		if (error == 0) {
			ASSERT3U(bp->blk_birth, ==, txg);
			bp->blk_cksum = lwb->lwb_blk.blk_cksum;
			bp->blk_cksum.zc_word[ZIL_ZC_SEQ]++;

			/*
			 * Allocate a new log write buffer (lwb).
			 */
			nlwb = zil_alloc_lwb(zilog, bp, txg);

			/* Record the block for later vdev flushing */
			zil_add_block(zilog, &lwb->lwb_blk);
		}

		if (BP_GET_CHECKSUM(&lwb->lwb_blk) == ZIO_CHECKSUM_ZILOG2) {
			/* For Slim ZIL only write what is used. */
			wsz = P2ROUNDUP_TYPED(lwb->lwb_nused, ZIL_MIN_BLKSZ, uint64_t);
			ASSERT3U(wsz, <=, lwb->lwb_sz);
			zio_shrink(lwb->lwb_zio, wsz);

		} else {
			wsz = lwb->lwb_sz;
		}

		zilc->zc_pad = 0;
		zilc->zc_nused = lwb->lwb_nused;
		zilc->zc_eck.zec_cksum = lwb->lwb_blk.blk_cksum;

		/*
		 * clear unused data for security
		 */
		bzero(lwb->lwb_buf + lwb->lwb_nused, wsz - lwb->lwb_nused);

		zio_nowait(lwb->lwb_zio); /* Kick off the write for the old log block */

		/*
		 * If there was an allocation failure then nlwb will be null which
		 * forces a txg_wait_synced().
		 */
		return (nlwb);
	}

	/*
	 * Copy the log record of itx into lwb, starting a new log block first
	 * if the record does not fit in the current one.
	 *
	 * Returns the lwb subsequent records should go into.  That is NULL when
	 * lwb was NULL on entry or when a new block could not be started.  On
	 * several paths the record is not added and the (possibly new) lwb is
	 * returned as is: the record cannot fit even in an empty block, or
	 * zl_get_data() reported an error.
	 */
	static lwb_t *
	zil_lwb_commit(zilog_t *zilog, itx_t *itx, lwb_t *lwb)
	{
		lr_t *lrc = &itx->itx_lr; /* common log record */
		lr_write_t *lrw = (lr_write_t *)lrc;
		char *lr_buf;
		uint64_t txg = lrc->lrc_txg;
		uint64_t reclen = lrc->lrc_reclen;
		uint64_t dlen = 0;

		if (lwb == NULL)
			return (NULL);

		ASSERT(lwb->lwb_buf != NULL);
		ASSERT(zilog_is_dirty(zilog) ||
		    spa_freeze_txg(zilog->zl_spa) != UINT64_MAX);

		/* WR_NEED_COPY data is stored inline, right after the record. */
		if (lrc->lrc_txtype == TX_WRITE && itx->itx_wr_state == WR_NEED_COPY)
			dlen = P2ROUNDUP_TYPED(
			    lrw->lr_length, sizeof (uint64_t), uint64_t);

		zilog->zl_cur_used += (reclen + dlen);

		zil_lwb_write_init(zilog, lwb);

		/*
		 * If this record won't fit in the current log block, start a new one.
		 */
		if (lwb->lwb_nused + reclen + dlen > lwb->lwb_sz) {
			lwb = zil_lwb_write_start(zilog, lwb);
			if (lwb == NULL)
				return (NULL);
			zil_lwb_write_init(zilog, lwb);
			ASSERT(LWB_EMPTY(lwb));
			/* too big even for an empty block: wait for the txg */
			if (lwb->lwb_nused + reclen + dlen > lwb->lwb_sz) {
				txg_wait_synced(zilog->zl_dmu_pool, txg);
				return (lwb);
			}
		}

		/* From here on lrc/lrw refer to the copy inside the lwb buffer. */
		lr_buf = lwb->lwb_buf + lwb->lwb_nused;
		bcopy(lrc, lr_buf, reclen);
		lrc = (lr_t *)lr_buf;
		lrw = (lr_write_t *)lrc;

		/*
		 * If it's a write, fetch the data or get its blkptr as appropriate.
		 */
		if (lrc->lrc_txtype == TX_WRITE) {
			if (txg > spa_freeze_txg(zilog->zl_spa))
				txg_wait_synced(zilog->zl_dmu_pool, txg);
			if (itx->itx_wr_state != WR_COPIED) {
				char *dbuf;
				int error;

				if (dlen) {
					ASSERT(itx->itx_wr_state == WR_NEED_COPY);
					dbuf = lr_buf + reclen;
					lrw->lr_common.lrc_reclen += dlen;
				} else {
					ASSERT(itx->itx_wr_state == WR_INDIRECT);
					dbuf = NULL;
				}
				error = zilog->zl_get_data(
				    itx->itx_private, lrw, dbuf, lwb->lwb_zio);
				if (error == EIO) {
					txg_wait_synced(zilog->zl_dmu_pool, txg);
					return (lwb);
				}
				if (error != 0) {
					ASSERT(error == ENOENT || error == EEXIST ||
					    error == EALREADY);
					return (lwb);
				}
			}
		}

		/*
		 * We're actually making an entry, so update lrc_seq to be the
		 * log record sequence number.  Note that this is generally not
		 * equal to the itx sequence number because not all transactions
		 * are synchronous, and sometimes spa_sync() gets there first.
		 */
		lrc->lrc_seq = ++zilog->zl_lr_seq; /* we are single threaded */
		lwb->lwb_nused += reclen + dlen;
		lwb->lwb_max_txg = MAX(lwb->lwb_max_txg, txg);
		ASSERT3U(lwb->lwb_nused, <=, lwb->lwb_sz);
		ASSERT0(P2PHASE(lwb->lwb_nused, sizeof (uint64_t)));

		return (lwb);
	}

	/*
	 * Allocate an in-memory intent log transaction (itx) of type txtype with
	 * room for a log record of lrsize bytes, rounded up to a multiple of
	 * sizeof (uint64_t).  The itx defaults to synchronous.
	 */
	itx_t *
	zil_itx_create(uint64_t txtype, size_t lrsize)
	{
		itx_t *new_itx;

		lrsize = P2ROUNDUP_TYPED(lrsize, sizeof (uint64_t), size_t);
		new_itx = kmem_alloc(offsetof(itx_t, itx_lr) + lrsize, KM_SLEEP);

		/* log record header */
		new_itx->itx_lr.lrc_txtype = txtype;
		new_itx->itx_lr.lrc_reclen = lrsize;
		new_itx->itx_lr.lrc_seq = 0;	/* defensive */

		/* itx bookkeeping */
		new_itx->itx_sod = lrsize; /* if write & WR_NEED_COPY will be increased */
		new_itx->itx_sync = B_TRUE;	/* default is synchronous */

		return (new_itx);
	}

	/*
	 * Free an itx.  The allocation size is recomputed from the record length
	 * stored in the itx's log record header.
	 */
	void
	zil_itx_destroy(itx_t *itx)
	{
		const size_t alloc_size =
		    offsetof(itx_t, itx_lr) + itx->itx_lr.lrc_reclen;

		kmem_free(itx, alloc_size);
	}

	/*
	 * Free up the sync and async itxs.  The itxs_t has already been detached
	 * so no locks are needed.
	 *
	 * Every itx on the sync list and on each per-object async list is freed
	 * with zil_itx_destroy(), then the async tree nodes, the tree and the
	 * itxs_t itself are released.
	 */
	static void
	zil_itxg_clean(itxs_t *itxs)
	{
		itx_t *itx;
		list_t *list;
		avl_tree_t *t;
		void *cookie;
		itx_async_node_t *ian;

		list = &itxs->i_sync_list;
		while ((itx = list_head(list)) != NULL) {
			list_remove(list, itx);
			zil_itx_destroy(itx);
		}

		cookie = NULL;
		t = &itxs->i_async_tree;
		while ((ian = avl_destroy_nodes(t, &cookie)) != NULL) {
			list = &ian->ia_list;
			while ((itx = list_head(list)) != NULL) {
				list_remove(list, itx);
				zil_itx_destroy(itx);
			}
			list_destroy(list);
			kmem_free(ian, sizeof (itx_async_node_t));
		}
		avl_destroy(t);

		kmem_free(itxs, sizeof (itxs_t));
	}

	/*
	 * AVL comparator for an itxs_t's i_async_tree: orders itx_async_node_t
	 * entries by object id (ia_foid).  Returns -1, 0 or 1.
	 */
	static int
	zil_aitx_compare(const void *x1, const void *x2)
	{
		const uint64_t o1 = ((itx_async_node_t *)x1)->ia_foid;
		const uint64_t o2 = ((itx_async_node_t *)x2)->ia_foid;

		if (o1 < o2)
			return (-1);
		if (o1 > o2)
			return (1);

		return (0);
	}

	/*
	 * Remove all async itx with the given oid.
	 *
	 * The matching per-object lists are detached from each candidate txg's
	 * async tree under that itxg's lock and collected on a private list;
	 * the itxs are freed only after the locks have been dropped.
	 */
	static void
	zil_remove_async(zilog_t *zilog, uint64_t oid)
	{
		uint64_t otxg, txg;
		itx_async_node_t *ian;
		avl_tree_t *t;
		avl_index_t where;
		list_t clean_list;
		itx_t *itx;

		ASSERT(oid != 0);
		list_create(&clean_list, sizeof (itx_t), offsetof(itx_t, itx_node));

		if (spa_freeze_txg(zilog->zl_spa) != UINT64_MAX) /* ziltest support */
			otxg = ZILTEST_TXG;
		else
			otxg = spa_last_synced_txg(zilog->zl_spa) + 1;

		for (txg = otxg; txg < (otxg + TXG_CONCURRENT_STATES); txg++) {
			itxg_t *itxg = &zilog->zl_itxg[txg & TXG_MASK];

			mutex_enter(&itxg->itxg_lock);
			/* slot currently holds a different txg: nothing to do */
			if (itxg->itxg_txg != txg) {
				mutex_exit(&itxg->itxg_lock);
				continue;
			}

			/*
			 * Locate the object node and append its list.
			 */
			t = &itxg->itxg_itxs->i_async_tree;
			ian = avl_find(t, &oid, &where);
			if (ian != NULL)
				list_move_tail(&clean_list, &ian->ia_list);
			mutex_exit(&itxg->itxg_lock);
		}
		while ((itx = list_head(&clean_list)) != NULL) {
			list_remove(&clean_list, itx);
			zil_itx_destroy(itx);
		}
		list_destroy(&clean_list);
	}

	/*
	 * Attach itx to the zilog's in-memory itx group for the txg of tx (or
	 * for ZILTEST_TXG when the pool is frozen) and mark the zilog dirty.
	 *
	 * Synchronous itxs are appended to the group's sync list; asynchronous
	 * ones go on a per-object list in the group's async tree.
	 */
	void
	zil_itx_assign(zilog_t *zilog, itx_t *itx, dmu_tx_t *tx)
	{
		uint64_t txg;
		itxg_t *itxg;
		itxs_t *itxs, *clean = NULL;

		/*
		 * Object ids can be re-instantiated in the next txg so
		 * remove any async transactions to avoid future leaks.
		 * This can happen if a fsync occurs on the re-instantiated
		 * object for a WR_INDIRECT or WR_NEED_COPY write, which gets
		 * the new file data and flushes a write record for the old object.
		 */
		if ((itx->itx_lr.lrc_txtype & ~TX_CI) == TX_REMOVE)
			zil_remove_async(zilog, itx->itx_oid);

		/*
		 * Ensure the data of a renamed file is committed before the rename.
		 */
		if ((itx->itx_lr.lrc_txtype & ~TX_CI) == TX_RENAME)
			zil_async_to_sync(zilog, itx->itx_oid);

		if (spa_freeze_txg(zilog->zl_spa) != UINT64_MAX)
			txg = ZILTEST_TXG;
		else
			txg = dmu_tx_get_txg(tx);

		itxg = &zilog->zl_itxg[txg & TXG_MASK];
		mutex_enter(&itxg->itxg_lock);
		itxs = itxg->itxg_itxs;
		if (itxg->itxg_txg != txg) {
			if (itxs != NULL) {
				/*
				 * The zil_clean callback hasn't got around to cleaning
				 * this itxg.  Save the itxs for release below.
				 * This should be rare.
				 */
				atomic_add_64(&zilog->zl_itx_list_sz, -itxg->itxg_sod);
				itxg->itxg_sod = 0;
				clean = itxg->itxg_itxs;
			}
			ASSERT(itxg->itxg_sod == 0);
			itxg->itxg_txg = txg;
			itxs = itxg->itxg_itxs = kmem_zalloc(sizeof (itxs_t), KM_SLEEP);

			list_create(&itxs->i_sync_list, sizeof (itx_t),
			    offsetof(itx_t, itx_node));
			avl_create(&itxs->i_async_tree, zil_aitx_compare,
			    sizeof (itx_async_node_t),
			    offsetof(itx_async_node_t, ia_node));
		}
		if (itx->itx_sync) {
			list_insert_tail(&itxs->i_sync_list, itx);
			atomic_add_64(&zilog->zl_itx_list_sz, itx->itx_sod);
			itxg->itxg_sod += itx->itx_sod;
		} else {
			avl_tree_t *t = &itxs->i_async_tree;
			uint64_t foid = ((lr_ooo_t *)&itx->itx_lr)->lr_foid;
			itx_async_node_t *ian;
			avl_index_t where;

			/* one tree node (and itx list) per object id */
			ian = avl_find(t, &foid, &where);
			if (ian == NULL) {
				ian = kmem_alloc(sizeof (itx_async_node_t), KM_SLEEP);
				list_create(&ian->ia_list, sizeof (itx_t),
				    offsetof(itx_t, itx_node));
				ian->ia_foid = foid;
				avl_insert(t, ian, where);
			}
			list_insert_tail(&ian->ia_list, itx);
		}

		itx->itx_lr.lrc_txg = dmu_tx_get_txg(tx);
		zilog_dirty(zilog, txg);
		mutex_exit(&itxg->itxg_lock);

		/* Release the old itxs now we've dropped the lock */
		if (clean != NULL)
			zil_itxg_clean(clean);
	}

	/*
	 * If there are any in-memory intent log transactions which have now been
	 * synced then start up a taskq to free them. We should only do this after we
	 * have written out the uberblocks (i.e. txg has been committed) so that we
	 * don't inadvertently clean out in-memory log records that would be required
	 * by zil_commit().
	 *
	 * zilog      - intent log to clean
	 * synced_txg - txg that has just been synced; selects the itxg slot
	 */
	void
	zil_clean(zilog_t *zilog, uint64_t synced_txg)
	{
		itxg_t *itxg = &zilog->zl_itxg[synced_txg & TXG_MASK];
		itxs_t *clean_me;

		mutex_enter(&itxg->itxg_lock);
		/* Nothing queued in this slot, or the slot belongs to ziltest. */
		if (itxg->itxg_itxs == NULL || itxg->itxg_txg == ZILTEST_TXG) {
			mutex_exit(&itxg->itxg_lock);
			return;
		}
		ASSERT3U(itxg->itxg_txg, <=, synced_txg);
		ASSERT(itxg->itxg_txg != 0);
		ASSERT(zilog->zl_clean_taskq != NULL);
		atomic_add_64(&zilog->zl_itx_list_sz, -itxg->itxg_sod);
		itxg->itxg_sod = 0;
		/* Detach the itxs under the lock; they are freed after it is dropped. */
		clean_me = itxg->itxg_itxs;
		itxg->itxg_itxs = NULL;
		itxg->itxg_txg = 0;
		mutex_exit(&itxg->itxg_lock);
		/*
		 * Preferably start a task queue to free up the old itxs but
		 * if taskq_dispatch can't allocate resources to do that then
		 * free it in-line. This should be rare. Note, using TQ_SLEEP
		 * created a bad performance problem.
		 */
		if (taskq_dispatch(zilog->zl_clean_taskq,
		    (void (*)(void *))zil_itxg_clean, clean_me, TQ_NOSLEEP) == NULL)
			zil_itxg_clean(clean_me);
	}

	/*
	 * Gather the sync itxs of every candidate txg onto zl_itx_commit_list and
	 * subtract their accumulated itxg_sod from zl_itx_list_sz.
	 */
	static void
	zil_get_commit_list(zilog_t *zilog)
	{
		list_t *dest = &zilog->zl_itx_commit_list;
		uint64_t moved_sod = 0;
		uint64_t first_txg, cur_txg;

		/* ziltest support */
		first_txg = (spa_freeze_txg(zilog->zl_spa) != UINT64_MAX) ?
		    ZILTEST_TXG : spa_last_synced_txg(zilog->zl_spa) + 1;

		for (cur_txg = first_txg;
		    cur_txg < (first_txg + TXG_CONCURRENT_STATES); cur_txg++) {
			itxg_t *itxg = &zilog->zl_itxg[cur_txg & TXG_MASK];

			mutex_enter(&itxg->itxg_lock);
			if (itxg->itxg_txg == cur_txg) {
				/* Slot really holds this txg: take its sync list. */
				list_move_tail(dest, &itxg->itxg_itxs->i_sync_list);
				moved_sod += itxg->itxg_sod;
				itxg->itxg_sod = 0;
			}
			mutex_exit(&itxg->itxg_lock);
		}

		atomic_add_64(&zilog->zl_itx_list_sz, -moved_sod);
	}

	/*
	 * Promote async itxs to the sync lists so that a commit will write them.
	 * With a non-zero foid only that object's itxs are moved; with foid == 0
	 * every object's async list is moved and the async tree is emptied.
	 */
	static void
	zil_async_to_sync(zilog_t *zilog, uint64_t foid)
	{
		uint64_t first_txg, cur_txg;

		/* ziltest support */
		first_txg = (spa_freeze_txg(zilog->zl_spa) != UINT64_MAX) ?
		    ZILTEST_TXG : spa_last_synced_txg(zilog->zl_spa) + 1;

		for (cur_txg = first_txg;
		    cur_txg < (first_txg + TXG_CONCURRENT_STATES); cur_txg++) {
			itxg_t *itxg = &zilog->zl_itxg[cur_txg & TXG_MASK];
			itx_async_node_t *node;
			avl_tree_t *tree;
			avl_index_t where;

			mutex_enter(&itxg->itxg_lock);
			if (itxg->itxg_txg != cur_txg) {
				mutex_exit(&itxg->itxg_lock);
				continue;
			}

			/*
			 * Lists are appended to the tail of the sync list, never
			 * the head, to ensure the create has happened.
			 */
			tree = &itxg->itxg_itxs->i_async_tree;
			if (foid == 0) {
				void *cookie = NULL;

				/* No object given: drain the whole tree. */
				while ((node = avl_destroy_nodes(tree, &cookie)) !=
				    NULL) {
					list_move_tail(&itxg->itxg_itxs->i_sync_list,
					    &node->ia_list);
					list_destroy(&node->ia_list);
					kmem_free(node, sizeof (itx_async_node_t));
				}
			} else {
				/* Single object: move just its list, if any. */
				node = avl_find(tree, &foid, &where);
				if (node != NULL) {
					list_move_tail(&itxg->itxg_itxs->i_sync_list,
					    &node->ia_list);
				}
			}
			mutex_exit(&itxg->itxg_lock);
		}
	}

	/*
	 * Write out every itx currently on the sync lists and wait for the
	 * resulting log blocks to reach stable storage.
	 *
	 * Entered with zl_lock held: the lock is dropped for the duration of the
	 * work and re-acquired before every return.
	 */
	static void
	zil_commit_writer(zilog_t *zilog)
	{
		uint64_t txg;
		itx_t *itx;
		lwb_t *lwb;
		spa_t *spa = zilog->zl_spa;
		int error = 0;

		ASSERT(zilog->zl_root_zio == NULL);

		mutex_exit(&zilog->zl_lock);

		zil_get_commit_list(zilog);

		/*
		 * Return if there's nothing to commit before we dirty the fs by
		 * calling zil_create().
		 */
		if (list_head(&zilog->zl_itx_commit_list) == NULL) {
			mutex_enter(&zilog->zl_lock);
			return;
		}

		/* A suspended log writes no blocks; lwb stays NULL in that case. */
		if (zilog->zl_suspend) {
			lwb = NULL;
		} else {
			lwb = list_tail(&zilog->zl_lwb_list);
			if (lwb == NULL)
				lwb = zil_create(zilog);
		}

		DTRACE_PROBE1(zil__cw1, zilog_t *, zilog);
		while ((itx = list_head(&zilog->zl_itx_commit_list)) != NULL) {
			txg = itx->itx_lr.lrc_txg;
			ASSERT(txg);

			/* Only records whose txg is not yet synced need logging. */
			if (txg > spa_last_synced_txg(spa) || txg > spa_freeze_txg(spa))
				lwb = zil_lwb_commit(zilog, itx, lwb);
			list_remove(&zilog->zl_itx_commit_list, itx);
			kmem_free(itx, offsetof(itx_t, itx_lr)
			    + itx->itx_lr.lrc_reclen);
		}
		DTRACE_PROBE1(zil__cw2, zilog_t *, zilog);

		/* write the last block out */
		if (lwb != NULL && lwb->lwb_zio != NULL)
			lwb = zil_lwb_write_start(zilog, lwb);

		zilog->zl_cur_used = 0;

		/*
		 * Wait if necessary for the log blocks to be on stable storage.
		 */
		if (zilog->zl_root_zio) {
			error = zio_wait(zilog->zl_root_zio);
			zilog->zl_root_zio = NULL;
			zil_flush_vdevs(zilog);
		}

		/* No usable log block, or a write failed: wait for a txg sync. */
		if (error || lwb == NULL)
			txg_wait_synced(zilog->zl_dmu_pool, 0);

		mutex_enter(&zilog->zl_lock);

		/*
		 * Remember the highest committed log sequence number for ztest.
		 * We only update this value when all the log writes succeeded,
		 * because ztest wants to ASSERT that it got the whole log chain.
		 */
		if (error == 0 && lwb != NULL)
			zilog->zl_commit_lr_seq = zilog->zl_lr_seq;
	}

	/*
	* Commit zfs transactions to stable storage.
	* If foid is 0 push out all transactions, otherwise push only those
	* for that object or might reference that object.
	*
	* itxs are committed in batches. In a heavily stressed zil there will be
	* a commit writer thread who is writing out a bunch of itxs to the log
	* for a set of committing threads (cthreads) in the same batch as the writer.
	* Those cthreads are all waiting on the same cv for that batch.
	*
	* There will also be a different and growing batch of threads that are
	* waiting to commit (qthreads). When the committing batch completes
	* a transition occurs such that the cthreads exit and the qthreads become
	* cthreads. One of the new cthreads becomes the writer thread for the
	* batch. Any new threads arriving become new qthreads.
	*
	* Only 2 condition variables are needed and there's no transition
	* between the two cvs needed. They just flip-flop between qthreads
	* and cthreads.
	*
	* Using this scheme we can efficiently wake up only those threads
	* that have been committed.
	*/
	void
	zil_commit(zilog_t *zilog, uint64_t foid)
	{
	uint64_t mybatch;

	/* Nothing is committed when sync is disabled for this log. */
	if (zilog->zl_sync == ZFS_SYNC_DISABLED)
	return;

	/* move the async itxs for the foid to the sync queues */
	zil_async_to_sync(zilog, foid);

	mutex_enter(&zilog->zl_lock);
	/* Join the batch that is currently forming. */
	mybatch = zilog->zl_next_batch;
	/*
	* While another thread is the writer, sleep on this batch's cv (the
	* two cvs are selected by the low bit of the batch number). If our
	* batch has been committed by the time we wake, we are done.
	*/
	while (zilog->zl_writer) {
	cv_wait(&zilog->zl_cv_batch[mybatch & 1], &zilog->zl_lock);
	if (mybatch <= zilog->zl_com_batch) {
	mutex_exit(&zilog->zl_lock);
	return;
	}
	}

	/*
	* No writer is active: become the writer for this batch. Bumping
	* zl_next_batch makes later arrivals join the following batch.
	*/
	zilog->zl_next_batch++;
	zilog->zl_writer = B_TRUE;
	zil_commit_writer(zilog);
	zilog->zl_com_batch = mybatch;
	zilog->zl_writer = B_FALSE;
	mutex_exit(&zilog->zl_lock);

	/* wake up one thread to become the next writer */
	cv_signal(&zilog->zl_cv_batch[(mybatch+1) & 1]);

	/* wake up all threads waiting for this batch to be committed */
	cv_broadcast(&zilog->zl_cv_batch[mybatch & 1]);
	}

	/*
	 * Called in syncing context to free committed log blocks and update log header.
	 *
	 * zilog - intent log being synced
	 * tx    - syncing transaction; its txg decides which log blocks are freed
	 */
	void
	zil_sync(zilog_t *zilog, dmu_tx_t *tx)
	{
		zil_header_t *zh = zil_header_in_syncing_context(zilog);
		uint64_t txg = dmu_tx_get_txg(tx);
		spa_t *spa = zilog->zl_spa;
		uint64_t *replayed_seq = &zilog->zl_replayed_seq[txg & TXG_MASK];
		lwb_t *lwb;

		/*
		 * We don't zero out zl_destroy_txg, so make sure we don't try
		 * to destroy it twice.
		 */
		if (spa_sync_pass(spa) != 1)
			return;

		mutex_enter(&zilog->zl_lock);

		ASSERT(zilog->zl_stop_sync == 0);

		/* Persist the replay progress recorded for this txg, then reset it. */
		if (*replayed_seq != 0) {
			ASSERT(zh->zh_replay_seq < *replayed_seq);
			zh->zh_replay_seq = *replayed_seq;
			*replayed_seq = 0;
		}

		if (zilog->zl_destroy_txg == txg) {
			/* Copy the block pointer before the header is zeroed below. */
			blkptr_t blk = zh->zh_log;

			ASSERT(list_head(&zilog->zl_lwb_list) == NULL);

			bzero(zh, sizeof (zil_header_t));
			bzero(zilog->zl_replayed_seq, sizeof (zilog->zl_replayed_seq));

			if (zilog->zl_keep_first) {
				/*
				 * If this block was part of log chain that couldn't
				 * be claimed because a device was missing during
				 * zil_claim(), but that device later returns,
				 * then this block could erroneously appear valid.
				 * To guard against this, assign a new GUID to the new
				 * log chain so it doesn't matter what blk points to.
				 */
				zil_init_log_chain(zilog, &blk);
				zh->zh_log = blk;
			}
		}

		/*
		 * Free lwbs from the head of the list, stopping at the first one
		 * that still has a buffer or whose lwb_max_txg is beyond this txg.
		 */
		while ((lwb = list_head(&zilog->zl_lwb_list)) != NULL) {
			zh->zh_log = lwb->lwb_blk;
			if (lwb->lwb_buf != NULL || lwb->lwb_max_txg > txg)
				break;
			list_remove(&zilog->zl_lwb_list, lwb);
			zio_free_zil(spa, txg, &lwb->lwb_blk);
			kmem_cache_free(zil_lwb_cache, lwb);

			/*
			 * If we don't have anything left in the lwb list then
			 * we've had an allocation failure and we need to zero
			 * out the zil_header blkptr so that we don't end
			 * up freeing the same block twice.
			 */
			if (list_head(&zilog->zl_lwb_list) == NULL)
				BP_ZERO(&zh->zh_log);
		}
		mutex_exit(&zilog->zl_lock);
	}

	/*
	 * Create the kmem cache of struct lwb objects (zil_lwb_cache).
	 */
	void
	zil_init(void)
	{
		zil_lwb_cache = kmem_cache_create("zil_lwb_cache", sizeof (struct lwb),
		    0, NULL, NULL, NULL, NULL, NULL, 0);
	}

	/*
	 * Destroy the lwb kmem cache created by zil_init().
	 */
	void
	zil_fini(void)
	{
		kmem_cache_destroy(zil_lwb_cache);
	}

	/*
	 * Record a new sync setting in the log's zl_sync field.
	 */
	void
	zil_set_sync(zilog_t *zilog, uint64_t sync)
	{
		zilog->zl_sync = sync;
	}

	/*
	 * Record a new logbias setting in the log's zl_logbias field.
	 */
	void
	zil_set_logbias(zilog_t *zilog, uint64_t logbias)
	{
		zilog->zl_logbias = logbias;
	}

	/*
	 * Allocate and initialize the in-memory intent log state for an objset.
	 *
	 * os      - objset the log belongs to
	 * zh_phys - ZIL header to associate with the log (stored in zl_header)
	 *
	 * Returns the new zilog_t; release it with zil_free().
	 */
	zilog_t *
	zil_alloc(objset_t *os, zil_header_t *zh_phys)
	{
		zilog_t *zilog;

		/* Zero-filled, so every field not set below starts out 0/NULL. */
		zilog = kmem_zalloc(sizeof (zilog_t), KM_SLEEP);

		zilog->zl_header = zh_phys;
		zilog->zl_os = os;
		zilog->zl_spa = dmu_objset_spa(os);
		zilog->zl_dmu_pool = dmu_objset_pool(os);
		zilog->zl_destroy_txg = TXG_INITIAL - 1;
		zilog->zl_logbias = dmu_objset_logbias(os);
		zilog->zl_sync = dmu_objset_syncprop(os);
		zilog->zl_next_batch = 1;

		mutex_init(&zilog->zl_lock, NULL, MUTEX_DEFAULT, NULL);

		/* One lock per itxg slot. */
		for (int i = 0; i < TXG_SIZE; i++) {
			mutex_init(&zilog->zl_itxg[i].itxg_lock, NULL,
			    MUTEX_DEFAULT, NULL);
		}

		list_create(&zilog->zl_lwb_list, sizeof (lwb_t),
		    offsetof(lwb_t, lwb_node));

		list_create(&zilog->zl_itx_commit_list, sizeof (itx_t),
		    offsetof(itx_t, itx_node));

		mutex_init(&zilog->zl_vdev_lock, NULL, MUTEX_DEFAULT, NULL);

		avl_create(&zilog->zl_vdev_tree, zil_vdev_compare,
		    sizeof (zil_vdev_node_t), offsetof(zil_vdev_node_t, zv_node));

		cv_init(&zilog->zl_cv_writer, NULL, CV_DEFAULT, NULL);
		cv_init(&zilog->zl_cv_suspend, NULL, CV_DEFAULT, NULL);
		cv_init(&zilog->zl_cv_batch[0], NULL, CV_DEFAULT, NULL);
		cv_init(&zilog->zl_cv_batch[1], NULL, CV_DEFAULT, NULL);

		return (zilog);
	}

	/*
	 * Tear down a zilog_t: destroy its lists, trees, locks and condition
	 * variables, free any itxs still attached, then free the structure.
	 */
	void
	zil_free(zilog_t *zilog)
	{
		zilog->zl_stop_sync = 1;

		ASSERT0(zilog->zl_suspend);
		ASSERT0(zilog->zl_suspending);

		/* The lwb list is expected to be empty by now. */
		ASSERT(list_is_empty(&zilog->zl_lwb_list));
		list_destroy(&zilog->zl_lwb_list);

		avl_destroy(&zilog->zl_vdev_tree);
		mutex_destroy(&zilog->zl_vdev_lock);

		/* Likewise the commit list. */
		ASSERT(list_is_empty(&zilog->zl_itx_commit_list));
		list_destroy(&zilog->zl_itx_commit_list);

		for (int slot = 0; slot < TXG_SIZE; slot++) {
			itxg_t *itxg = &zilog->zl_itxg[slot];

			/*
			 * An itx can be generated without dirtying a txg (e.g.
			 * ztest TX_TRUNCATE), in which case no zil_clean()
			 * callback ever removes the entry; remove it here.
			 * The ziltest itxs are freed here as well.
			 */
			if (itxg->itxg_itxs != NULL)
				zil_itxg_clean(itxg->itxg_itxs);
			mutex_destroy(&itxg->itxg_lock);
		}

		mutex_destroy(&zilog->zl_lock);

		cv_destroy(&zilog->zl_cv_writer);
		cv_destroy(&zilog->zl_cv_suspend);
		cv_destroy(&zilog->zl_cv_batch[0]);
		cv_destroy(&zilog->zl_cv_batch[1]);

		kmem_free(zilog, sizeof (zilog_t));
	}

	/*
	 * Open an intent log.
	 *
	 * os       - objset whose intent log is being opened
	 * get_data - callback stored in zl_get_data
	 *
	 * Installs the callback, creates the "zil_clean" taskq and returns the
	 * objset's zilog.  The log must not already be open: both fields are
	 * asserted to be NULL on entry.
	 */
	zilog_t *
	zil_open(objset_t *os, zil_get_data_t *get_data)
	{
		zilog_t *zilog = dmu_objset_zil(os);

		ASSERT(zilog->zl_clean_taskq == NULL);
		ASSERT(zilog->zl_get_data == NULL);
		ASSERT(list_is_empty(&zilog->zl_lwb_list));

		zilog->zl_get_data = get_data;
		zilog->zl_clean_taskq = taskq_create("zil_clean", 1, minclsyspri,
		    2, 2, TASKQ_PREPOPULATE);

		return (zilog);
	}

	/*
	 * Close an intent log: commit everything outstanding, wait for the last
	 * logged txg to sync, destroy the clean taskq and free the final lwb.
	 */
	void
	zil_close(zilog_t *zilog)
	{
		lwb_t *lwb;
		uint64_t txg;

		/* commit all itx */
		zil_commit(zilog, 0);

		/*
		 * The stubby lwb's lwb_max_txg reflects the last activity for
		 * the zil.  Once txg_wait_synced() returns for that txg, every
		 * callback that may clean the zil has run, and only then is it
		 * safe to destroy the zl_clean_taskq.
		 */
		mutex_enter(&zilog->zl_lock);
		lwb = list_tail(&zilog->zl_lwb_list);
		txg = (lwb != NULL) ? lwb->lwb_max_txg : 0;
		mutex_exit(&zilog->zl_lock);
		if (txg != 0)
			txg_wait_synced(zilog->zl_dmu_pool, txg);
		ASSERT(!zilog_is_dirty(zilog));

		taskq_destroy(zilog->zl_clean_taskq);
		zilog->zl_clean_taskq = NULL;
		zilog->zl_get_data = NULL;

		/*
		 * At most one LWB should remain on the list; remove it now.
		 */
		mutex_enter(&zilog->zl_lock);
		lwb = list_head(&zilog->zl_lwb_list);
		if (lwb != NULL) {
			ASSERT(lwb == list_tail(&zilog->zl_lwb_list));
			list_remove(&zilog->zl_lwb_list, lwb);
			zio_buf_free(lwb->lwb_buf, lwb->lwb_sz);
			kmem_cache_free(zil_lwb_cache, lwb);
		}
		mutex_exit(&zilog->zl_lock);
	}

	/* Tag passed to the hold/release calls made by zil_suspend() and zil_resume(). */
	static char *suspend_tag = "zil suspending";

	/*
	 * Suspend an intent log. While in suspended mode, we still honor
	 * synchronous semantics, but we rely on txg_wait_synced() to do it.
	 * On old version pools, we suspend the log briefly when taking a
	 * snapshot so that it will have an empty intent log.
	 *
	 * Long holds are not really intended to be used the way we do here --
	 * held for such a short time. A concurrent caller of dsl_dataset_long_held()
	 * could fail. Therefore we take pains to only put a long hold if it is
	 * actually necessary. Fortunately, it will only be necessary if the
	 * objset is currently mounted (or the ZVOL equivalent). In that case it
	 * will already have a long hold, so we are not really making things any worse.
	 *
	 * Ideally, we would locate the existing long-holder (i.e. the zfsvfs_t or
	 * zvol_state_t), and use their mechanism to prevent their hold from being
	 * dropped (e.g. VFS_HOLD()). However, that would be even more pain for
	 * very little gain.
	 *
	 * if cookiep == NULL, this does both the suspend & resume.
	 * Otherwise, it returns with the dataset "long held", and the cookie
	 * should be passed into zil_resume().
	 *
	 * osname  - name of the objset whose log is to be suspended
	 * cookiep - NULL, or where to store the cookie for zil_resume()
	 *
	 * Returns 0 on success, the error from dmu_objset_hold() if the objset
	 * cannot be held, or EBUSY if the log still needs to be replayed.
	 */
	int
	zil_suspend(const char *osname, void **cookiep)
	{
		objset_t *os;
		zilog_t *zilog;
		const zil_header_t *zh;
		int error;

		error = dmu_objset_hold(osname, suspend_tag, &os);
		if (error != 0)
			return (error);
		zilog = dmu_objset_zil(os);

		mutex_enter(&zilog->zl_lock);
		zh = zilog->zl_header;

		if (zh->zh_flags & ZIL_REPLAY_NEEDED) {		/* unplayed log */
			mutex_exit(&zilog->zl_lock);
			dmu_objset_rele(os, suspend_tag);
			return (SET_ERROR(EBUSY));
		}

		/*
		 * Don't put a long hold in the cases where we can avoid it. This
		 * is when there is no cookie so we are doing a suspend & resume
		 * (i.e. called from zil_vdev_offline()), and there's nothing to do
		 * for the suspend because it's already suspended, or there's no ZIL.
		 */
		if (cookiep == NULL && !zilog->zl_suspending &&
		    (zilog->zl_suspend > 0 || BP_IS_HOLE(&zh->zh_log))) {
			mutex_exit(&zilog->zl_lock);
			dmu_objset_rele(os, suspend_tag);
			return (0);
		}

		/* Take the long hold on the dataset, then release the pool hold. */
		dsl_dataset_long_hold(dmu_objset_ds(os), suspend_tag);
		dsl_pool_rele(dmu_objset_pool(os), suspend_tag);

		zilog->zl_suspend++;

		if (zilog->zl_suspend > 1) {
			/*
			 * Someone else is already suspending it.
			 * Just wait for them to finish.
			 */

			while (zilog->zl_suspending)
				cv_wait(&zilog->zl_cv_suspend, &zilog->zl_lock);
			mutex_exit(&zilog->zl_lock);

			if (cookiep == NULL)
				zil_resume(os);
			else
				*cookiep = os;
			return (0);
		}

		/*
		 * If there is no pointer to an on-disk block, this ZIL must not
		 * be active (e.g. filesystem not mounted), so there's nothing
		 * to clean up.
		 */
		if (BP_IS_HOLE(&zh->zh_log)) {
			ASSERT(cookiep != NULL); /* fast path already handled */

			*cookiep = os;
			mutex_exit(&zilog->zl_lock);
			return (0);
		}

		/* We are the suspender; later callers wait on zl_cv_suspend. */
		zilog->zl_suspending = B_TRUE;
		mutex_exit(&zilog->zl_lock);

		zil_commit(zilog, 0);

		zil_destroy(zilog, B_FALSE);

		mutex_enter(&zilog->zl_lock);
		zilog->zl_suspending = B_FALSE;
		cv_broadcast(&zilog->zl_cv_suspend);
		mutex_exit(&zilog->zl_lock);

		if (cookiep == NULL)
			zil_resume(os);
		else
			*cookiep = os;
		return (0);
	}

	/*
	 * Undo one zil_suspend(): decrement the suspend count and release the
	 * dataset holds taken under suspend_tag.  'cookie' is the objset pointer
	 * that zil_suspend() handed back.
	 */
	void
	zil_resume(void *cookie)
	{
		objset_t *os = cookie;
		zilog_t *zilog = dmu_objset_zil(os);

		/* The suspend count is only touched with zl_lock held. */
		mutex_enter(&zilog->zl_lock);
		ASSERT(zilog->zl_suspend != 0);
		zilog->zl_suspend--;
		mutex_exit(&zilog->zl_lock);

		dsl_dataset_long_rele(dmu_objset_ds(os), suspend_tag);
		dsl_dataset_rele(dmu_objset_ds(os), suspend_tag);
	}

	/*
	* Replay context that zil_replay() builds and hands to
	* zil_replay_log_record() through zil_parse().
	*/
	typedef struct zil_replay_arg {
	zil_replay_func_t **zr_replay; /* replay functions, indexed by txtype */
	void *zr_arg; /* passed as the first argument of each replay function */
	boolean_t zr_byteswap; /* BP_SHOULD_BYTESWAP() of the header's zh_log */
	char *zr_lr; /* scratch buffer the record is copied into for replay */
	} zil_replay_arg_t;

	static int
	zil_replay_error(zilog_t zilog, lr_t lr, int error)
	{
	char name[MAXNAMELEN];

	zilog->zl_replaying_seq--; /* didn't actually replay this one */

	dmu_objset_name(zilog->zl_os, name);

	cmn_err(CE_WARN, "ZFS replay transaction error %d, "
	"dataset %s, seq 0x%llx, txtype %llu %s\n", error, name,
	(u_longlong_t)lr->lrc_seq,
	(u_longlong_t)(lr->lrc_txtype & ~TX_CI),
	(lr->lrc_txtype & TX_CI) ? "CI" : "");

	return (error);
	}

	/*
	 * Per-record callback given to zil_parse() by zil_replay(): replay one
	 * log record through the matching entry of the replay vector.
	 *
	 * zilog     - log being replayed
	 * lr        - record as found in the log block
	 * zra       - the zil_replay_arg_t set up by zil_replay()
	 * claim_txg - records with an older lrc_txg are skipped
	 *
	 * Returns 0 if the record was replayed or legitimately skipped, otherwise
	 * the error reported through zil_replay_error().
	 */
	static int
	zil_replay_log_record(zilog_t *zilog, lr_t *lr, void *zra, uint64_t claim_txg)
	{
		zil_replay_arg_t *zr = zra;
		const zil_header_t *zh = zilog->zl_header;
		uint64_t reclen = lr->lrc_reclen;
		uint64_t txtype = lr->lrc_txtype;
		int error = 0;

		zilog->zl_replaying_seq = lr->lrc_seq;

		if (lr->lrc_seq <= zh->zh_replay_seq)	/* already replayed */
			return (0);

		if (lr->lrc_txg < claim_txg)		/* already committed */
			return (0);

		/* Strip case-insensitive bit, still present in log record */
		txtype &= ~TX_CI;

		if (txtype == 0 || txtype >= TX_MAX_TYPE)
			return (zil_replay_error(zilog, lr, EINVAL));

		/*
		 * If this record type can be logged out of order, the object
		 * (lr_foid) may no longer exist. That's legitimate, not an error.
		 */
		if (TX_OOO(txtype)) {
			error = dmu_object_info(zilog->zl_os,
			    ((lr_ooo_t *)lr)->lr_foid, NULL);
			if (error == ENOENT || error == EEXIST)
				return (0);
		}

		/*
		 * Make a copy of the data so we can revise and extend it.
		 */
		bcopy(lr, zr->zr_lr, reclen);

		/*
		 * If this is a TX_WRITE with a blkptr, suck in the data.
		 */
		if (txtype == TX_WRITE && reclen == sizeof (lr_write_t)) {
			/* The data is placed directly after the copied record. */
			error = zil_read_log_data(zilog, (lr_write_t *)lr,
			    zr->zr_lr + reclen);
			if (error != 0)
				return (zil_replay_error(zilog, lr, error));
		}

		/*
		 * The log block containing this lr may have been byteswapped
		 * so that we can easily examine common fields like lrc_txtype.
		 * However, the log is a mix of different record types, and only the
		 * replay vectors know how to byteswap their records. Therefore, if
		 * the lr was byteswapped, undo it before invoking the replay vector.
		 */
		if (zr->zr_byteswap)
			byteswap_uint64_array(zr->zr_lr, reclen);

		/*
		 * We must now do two things atomically: replay this log record,
		 * and update the log header sequence number to reflect the fact that
		 * we did so. At the end of each replay function the sequence number
		 * is updated if we are in replay mode.
		 */
		error = zr->zr_replay[txtype](zr->zr_arg, zr->zr_lr, zr->zr_byteswap);
		if (error != 0) {
			/*
			 * The DMU's dnode layer doesn't see removes until the txg
			 * commits, so a subsequent claim can spuriously fail with
			 * EEXIST. So if we receive any error we try syncing out
			 * any removes then retry the transaction. Note that we
			 * specify B_FALSE for byteswap now, so we don't do it twice.
			 */
			txg_wait_synced(spa_get_dsl(zilog->zl_spa), 0);
			error = zr->zr_replay[txtype](zr->zr_arg, zr->zr_lr, B_FALSE);
			if (error != 0)
				return (zil_replay_error(zilog, lr, error));
		}
		return (0);
	}

	/*
	 * Per-block callback given to zil_parse() by zil_replay(): counts the log
	 * blocks visited in zl_replay_blks.  bp, arg and claim_txg are unused.
	 * Always returns 0.
	 */
	/* ARGSUSED */
	static int
	zil_incr_blks(zilog_t *zilog, blkptr_t *bp, void *arg, uint64_t claim_txg)
	{
		zilog->zl_replay_blks++;

		return (0);
	}

	/*
	 * If this dataset has a non-empty intent log, replay it and destroy it.
	 *
	 * os          - objset whose log is to be replayed
	 * arg         - opaque pointer handed to every replay function
	 * replay_func - table of replay functions indexed by transaction type
	 */
	void
	zil_replay(objset_t *os, void *arg, zil_replay_func_t *replay_func[TX_MAX_TYPE])
	{
		zilog_t *zilog = dmu_objset_zil(os);
		const zil_header_t *zh = zilog->zl_header;
		zil_replay_arg_t zr;

		/* No replay needed: just destroy the log. */
		if ((zh->zh_flags & ZIL_REPLAY_NEEDED) == 0) {
			zil_destroy(zilog, B_TRUE);
			return;
		}

		zr.zr_replay = replay_func;
		zr.zr_arg = arg;
		zr.zr_byteswap = BP_SHOULD_BYTESWAP(&zh->zh_log);
		/* Scratch buffer for the record copy; freed after zil_parse(). */
		zr.zr_lr = kmem_alloc(2 * SPA_MAXBLOCKSIZE, KM_SLEEP);

		/*
		 * Wait for in-progress removes to sync before starting replay.
		 */
		txg_wait_synced(zilog->zl_dmu_pool, 0);

		zilog->zl_replay = B_TRUE;
		zilog->zl_replay_time = ddi_get_lbolt();
		ASSERT(zilog->zl_replay_blks == 0);
		(void) zil_parse(zilog, zil_incr_blks, zil_replay_log_record, &zr,
		    zh->zh_claim_txg);
		kmem_free(zr.zr_lr, 2 * SPA_MAXBLOCKSIZE);

		zil_destroy(zilog, B_FALSE);
		txg_wait_synced(zilog->zl_dmu_pool, zilog->zl_destroy_txg);
		zilog->zl_replay = B_FALSE;
	}

	/*
	 * Returns B_TRUE when sync is disabled for this log or when the log is
	 * being replayed, B_FALSE otherwise.
	 *
	 * During replay this also dirties the dataset in 'tx' and stores the
	 * sequence number of the record being replayed in the zl_replayed_seq
	 * slot for tx's txg.
	 */
	boolean_t
	zil_replaying(zilog_t *zilog, dmu_tx_t *tx)
	{
		if (zilog->zl_sync == ZFS_SYNC_DISABLED)
			return (B_TRUE);

		if (zilog->zl_replay) {
			dsl_dataset_dirty(dmu_objset_ds(zilog->zl_os), tx);
			zilog->zl_replayed_seq[dmu_tx_get_txg(tx) & TXG_MASK] =
			    zilog->zl_replaying_seq;
			return (B_TRUE);
		}

		return (B_FALSE);
	}

	/*
	 * Suspend and immediately resume the intent log of the named objset
	 * (zil_suspend() with a NULL cookie).  'arg' is unused.
	 *
	 * Returns 0 on success; any failure of zil_suspend() is reported as
	 * EEXIST.
	 */
	/* ARGSUSED */
	int
	zil_vdev_offline(const char *osname, void *arg)
	{
		int error;

		error = zil_suspend(osname, NULL);
		if (error != 0)
			return (SET_ERROR(EEXIST));
		return (0);
	}

File Metadata

Mime Type: text/x-diff
Expires: Sun, Jul 5, 9:19 AM (1 d, 13 h)
Storage Engine: blob
Storage Format: Raw Data
Storage Handle: 29011937
Default Alt Text: (487 KB)

No OneTemporaryActions

View Options

File Metadata

Event Timeline

No OneTemporary
Actions