ahrens commented on this pull request.


> + */
+typedef enum {
+       SPA_FORCE_TRIM_OFF = 0, /* default */
+       SPA_FORCE_TRIM_ON
+} spa_force_trim_t;
+
+/*
+ * Should we send TRIM commands in-line during normal pool operation while
+ * deleting stuff?
+ *     OFF: no
+ *     ON: yes
+ */
+typedef enum {
+       SPA_AUTO_TRIM_OFF = 0,  /* default */
+       SPA_AUTO_TRIM_ON
+} spa_auto_trim_t;

What does `auto_trim=on` do if none of the disks claim to support TRIM?  Same 
goes for `zpool trim`.

What is the impact of sending the TRIM command to a device that doesn't claim
to support it?

Rather than separate `force_trim` and `auto_trim` properties, I think this would
be easier to understand as a single `auto_trim=off|auto|force` property, with the
`zpool trim` command gaining a `--force` flag.
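
For illustration, a rough sketch of what the combined property might look like
(the `AUTO`/`FORCE` value names are just made up for this example, not something
in the patch):

```c
/*
 * Hypothetical combined property: auto_trim=off|auto|force.
 *	OFF:   never send TRIM during normal pool operation
 *	AUTO:  send TRIM only to devices that advertise support for it
 *	FORCE: send TRIM regardless of what the device claims
 */
typedef enum {
	SPA_AUTO_TRIM_OFF = 0,	/* default */
	SPA_AUTO_TRIM_AUTO,
	SPA_AUTO_TRIM_FORCE
} spa_auto_trim_t;
```

`zpool trim --force` would then mean the same thing as the `FORCE` value, but for
a one-off manual trim.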

> +
+       /*
+        * Another pool management task might be currently prevented from
+        * starting and the current txg sync was invoked on its behalf,
+        * so be prepared to postpone autotrim processing.
+        */
+       if (!mutex_tryenter(&spa->spa_auto_trim_lock))
+               return;
+       spa->spa_num_auto_trimming += spa->spa_root_vdev->vdev_children;
+       mutex_exit(&spa->spa_auto_trim_lock);
+
+       for (uint64_t i = 0; i < spa->spa_root_vdev->vdev_children; i++) {
+               vdev_trim_info_t *vti = kmem_zalloc(sizeof (*vti), KM_SLEEP);
+               vti->vti_vdev = spa->spa_root_vdev->vdev_child[i];
+               vti->vti_txg = txg;
+               vti->vti_done_cb = (void (*)(void *))spa_vdev_auto_trim_done;

Since the function (spa_vdev_auto_trim_done) and member (vti_done_cb) are both
new, can we declare their types such that the cast here is not necessary?
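
For example (a sketch only; the field names come from the quoted hunk, but the
`vdev_trim_done_cb_t` typedef is hypothetical), the member and the callback could
share a `void (*)(void *)` prototype, with the conversion moved inside the callback:

```c
typedef void (*vdev_trim_done_cb_t)(void *arg);

typedef struct vdev_trim_info {
	vdev_t			*vti_vdev;
	uint64_t		vti_txg;
	vdev_trim_done_cb_t	vti_done_cb;
	void			*vti_done_arg;
} vdev_trim_info_t;

static void
spa_vdev_auto_trim_done(void *arg)
{
	spa_t *spa = arg;	/* conversion happens here, not at assignment */
	/* ... */
}
```

Then the assignment is simply `vti->vti_done_cb = spa_vdev_auto_trim_done;` with
no cast.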

> +      * starting and the current txg sync was invoked on its behalf,
+        * so be prepared to postpone autotrim processing.
+        */
+       if (!mutex_tryenter(&spa->spa_auto_trim_lock))
+               return;
+       spa->spa_num_auto_trimming += spa->spa_root_vdev->vdev_children;
+       mutex_exit(&spa->spa_auto_trim_lock);
+
+       for (uint64_t i = 0; i < spa->spa_root_vdev->vdev_children; i++) {
+               vdev_trim_info_t *vti = kmem_zalloc(sizeof (*vti), KM_SLEEP);
+               vti->vti_vdev = spa->spa_root_vdev->vdev_child[i];
+               vti->vti_txg = txg;
+               vti->vti_done_cb = (void (*)(void *))spa_vdev_auto_trim_done;
+               vti->vti_done_arg = spa;
+               (void) taskq_dispatch(spa->spa_auto_trim_taskq,
+                   (void (*)(void *))vdev_auto_trim, vti, TQ_SLEEP);

Same here.
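
Same idea for the dispatch: if `vdev_auto_trim()` keeps a typed argument, a thin
wrapper with the `task_func_t` prototype would avoid the cast (sketch only; the
`vdev_auto_trim_task` name is made up):

```c
/* Thin wrapper matching taskq's task_func_t, i.e. void (*)(void *). */
static void
vdev_auto_trim_task(void *arg)
{
	vdev_auto_trim(arg);	/* arg is the vdev_trim_info_t * */
}
```

and then dispatch it without the cast:
`(void) taskq_dispatch(spa->spa_auto_trim_taskq, vdev_auto_trim_task, vti, TQ_SLEEP);`.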

> +              * threads because the trim rate might have changed above.
+                */
+               cv_broadcast(&spa->spa_man_trim_update_cv);
+               mutex_exit(&spa->spa_man_trim_lock);
+               return;
+       }
+       spa_man_trim_taskq_create(spa);
+       spa->spa_man_trim_stop = B_FALSE;
+
+       spa_event_notify(spa, NULL, ESC_ZFS_TRIM_START);
+       spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER);
+       for (uint64_t i = 0; i < spa->spa_root_vdev->vdev_children; i++) {
+               vdev_t *vd = spa->spa_root_vdev->vdev_child[i];
+               vdev_trim_info_t *vti = kmem_zalloc(sizeof (*vti), KM_SLEEP);
+               vti->vti_vdev = vd;
+               vti->vti_done_cb = (void (*)(void *))spa_vdev_man_trim_done;

Same here with the casts.

> +     for (uint64_t i = 0; i < spa->spa_root_vdev->vdev_children; i++) {
+               vdev_t *vd = spa->spa_root_vdev->vdev_child[i];
+               vdev_trim_info_t *vti = kmem_zalloc(sizeof (*vti), KM_SLEEP);
+               vti->vti_vdev = vd;
+               vti->vti_done_cb = (void (*)(void *))spa_vdev_man_trim_done;
+               vti->vti_done_arg = spa;
+               spa->spa_num_man_trimming++;
+
+               vd->vdev_trim_prog = 0;
+               (void) taskq_dispatch(spa->spa_man_trim_taskq,
+                   (void (*)(void *))vdev_man_trim, vti, TQ_SLEEP);
+       }
+       spa_config_exit(spa, SCL_CONFIG, FTAG);
+       time_update_tx = spa_trim_update_time(spa, gethrestime_sec(), 0);
+       mutex_exit(&spa->spa_man_trim_lock);
+       /* mustn't hold spa_man_trim_lock to prevent deadlock /w syncing ctx */

Can you explain that in more detail?  dmu_tx_commit() should never block for 
very long, so I'm not sure why you need to drop the spa_man_trim_lock before 
calling it.

> +     mutex_enter(&spa->spa_man_trim_lock);
+       ASSERT(spa->spa_num_man_trimming > 0);
+       spa->spa_num_man_trimming--;
+       if (spa->spa_num_man_trimming == 0) {
+               /* if we were interrupted, leave stop_time at zero */
+               if (!spa->spa_man_trim_stop)
+                       time_update_tx = spa_trim_update_time(spa, UINT64_MAX,
+                           gethrestime_sec());
+               spa_event_notify(spa, NULL, ESC_ZFS_TRIM_FINISH);
+               spa_async_request(spa, SPA_ASYNC_MAN_TRIM_TASKQ_DESTROY);
+               cv_broadcast(&spa->spa_man_trim_done_cv);
+       }
+       mutex_exit(&spa->spa_man_trim_lock);
+
+       if (time_update_tx != NULL)
+               dmu_tx_commit(time_update_tx);

Same question about why we can't do the dmu_tx_commit() from
spa_trim_update_time().

> +     ASSERT(spa->spa_num_auto_trimming > 0);
+       spa->spa_num_auto_trimming--;
+       if (spa->spa_num_auto_trimming == 0)
+               cv_broadcast(&spa->spa_auto_trim_done_cv);
+       mutex_exit(&spa->spa_auto_trim_lock);
+}
+
+/*
+ * Determines the minimum sensible rate at which a manual TRIM can be
+ * performed on a given spa and returns it. Since we perform TRIM in
+ * metaslab-sized increments, we'll just let the longest step between
+ * metaslab TRIMs be 100s (random number, really). Thus, on a typical
+ * 200-metaslab vdev, the longest TRIM should take is about 5.5 hours.
+ * It *can* take longer if the device is really slow respond to
+ * zio_trim() commands or it contains more than 200 metaslabs, or
+ * metaslab sizes vary widely between top-level vdevs.

Can you explain the rationale behind this?  Is the point that something happens
for every metaslab, and that if any one of those steps took more than 100 seconds
it would be bad?

Also, please specify the units of the value returned (I think it's bytes per
second).
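
To spell out my reading of the arithmetic in the quoted comment (the function and
example below are purely illustrative, not anything in the patch):

```c
#include <stdint.h>

/*
 * With a 100-second cap per metaslab step, a 200-metaslab vdev takes at
 * most 200 * 100 s = 20,000 s, roughly 5.5 hours, which matches the
 * quoted comment. The minimum rate would then presumably be:
 */
static uint64_t
min_trim_rate(uint64_t metaslab_size)
{
	return (metaslab_size / 100);	/* bytes per second */
}
```

Is that the intended derivation?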
