jaydoane commented on code in PR #5014:
URL: https://github.com/apache/couchdb/pull/5014#discussion_r1554730598
########## src/couch_scanner/README.md: ########## @@ -0,0 +1,107 @@ +Couch Scanner +============= + +Couch Scanner is an application which traverses all the dbs and docs in the +background and emits various reports. There is a common traversal mechanism +which finds all the dbs, design docs, docs and calls various reporting plugins. +Reporting plugins implement a common API and may search for various strings in +the doc bodies, examine design docs for certain features, check index sizes, or +issue any other reports. + +Two plugins are initially included with the application are: + * [couch_scanner_plugin_find](src/couch_scanner_plugin_find.erl) : Find + occurrences of any regular expressions in the cluster. It scans through + document bodies, db name and doc ids. This can be used to, for instance, to Review Comment: > This can be used to, for instance, to Maybe drop the first "to"? ########## src/couch_scanner/README.md: ########## @@ -0,0 +1,107 @@ +Couch Scanner +============= + +Couch Scanner is an application which traverses all the dbs and docs in the +background and emits various reports. There is a common traversal mechanism +which finds all the dbs, design docs, docs and calls various reporting plugins. +Reporting plugins implement a common API and may search for various strings in +the doc bodies, examine design docs for certain features, check index sizes, or +issue any other reports. + +Two plugins are initially included with the application are: + * [couch_scanner_plugin_find](src/couch_scanner_plugin_find.erl) : Find + occurrences of any regular expressions in the cluster. It scans through + document bodies, db name and doc ids. This can be used to, for instance, to + search accidentally leaked secrets (API keys, passwords). + * [couch_scanner_ddoc_features](src/couch_scanner_ddoc_features.erl) : Report + on various features used by design docs. By default it will report features + which will be deprecated in CouchDB 4.x such as shows, lists, rewrites, + etc. But also can be configured to search for javascript filter definition, + custom javascript reduce functions and a few other things. + +By default no plugins are enabled so the scanner application won't do anything. +Plugins can be enabled in the configuration system by setting: + +For instance, to enable `couch_scanner_ddoc_features` plugin use: + +``` +[couch_scanner_plugins] +couch_scanner_plugin_ddoc_features = true +``` + +If a node is put in maintenance mode all the plugins will be automatically +stopped on that node. When node is put back in production plugins will +automatically resume executing. It's also possible to pause plugin execution in +a remsh using the `couch_scanner:stop()` and then resume it later with +`couch_scanner:resume()`. + +#### Application Structure + +Top level application API is in the [couch_scanner](src/couch_scanner.erl) module. + + * status() -> return running status of plugins + * stop() -> stop running plugins on this node + * resume() -> resume running plugins on this node + * checkpoints() -> inspect the value of all the node local checkpoints + * reset_checkpoints() -> delete all the node local checkpoints + +##### couch_scanner_server + +Plugins are run as individual processes. These processes are managed by +[couch_scanner_server](src/couch_scanner_server.erl) gen_server. The gen_server +inspects `couch_scanner_plugins` config section and then starts a new plugin +process for each configured plugin. Then it waits for the process to exit. 
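As a usage aside to the config example and the `couch_scanner` API list quoted above, a remsh session on a node might look like the sketch below. Return values are omitted since they depend on the implementation; `config:set/3` is the standard runtime-config call, and the section/key names are taken from the README's own example.

```erlang
%% Enable the ddoc-features plugin at runtime instead of editing the ini file.
config:set("couch_scanner_plugins", "couch_scanner_plugin_ddoc_features", "true").

%% Inspect and control plugin execution on this node.
couch_scanner:status().
couch_scanner:stop().
couch_scanner:resume().

%% Inspect or wipe the node-local checkpoints while debugging.
couch_scanner:checkpoints().
couch_scanner:reset_checkpoints().
```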
When +the process exits it may exit normally, crash with an error, or indicate that +it should be rescheduled to run later. Later rescheduling is indicated by +exiting with a `{shutdown, {reschedule, UnixTimeSec}}` exit value. If the +plugin crashes repeatedly it will be penalized with an exponential back-off +starting at 30 seconds and up to 8 hours. + +##### couch_scanner_plugin + +Plugin processes are proc_lib processes spawned by the +[couch_scanner_plugin](src/couch_scanner_plugin.erl) module. After spawning the +processes will read the previously saved checkpoint from a `_local` checkpoint +doc in the `_dbs` db and start traversing databases from the last checkpoint. + +There is a plugin processes running for each configured plugin. On startup, +during traversal, checkpointing and before exiting it will call into the +corresponding API function. This works very much like the gen_server pattern. +The plugin API behavior is defined in the +[couch_scanner_plugin](src/couch_scanner_plugin.erl) module as `-callback` +directives. Plugin modules then refer to it as +`-behavior(couch_scanner_plugin)`. + +Periodically it will checkpoint the last database it processed so if the node +crashes or is stopped it will continue processing where it left off. + +During startup or when finished the plugin may exit with a +`{shutdown,{reschedule, UnixTimeSec}}` exit message to indicate that it wants +to be scheduled to run at a later time. + +##### couch_scanner_checkpoint + +Plugins periodically will checkpoint their database traversal progress to a +`_local` checkpoint doc in the `_dbs` database. Each plugin has their separate +checkpoint document. Plugin modules may implement the optional `checkpoint/1` +API and save some plugin specific data alongside the database traversal +checkpoint which gets saved automatically. For instance, it maybe useful for +plugins to save their start up configuration to detect when it changes so they +could restart their scanning. Or, it they want to accumulate some statistics +and only emit them at the end of the scan. + +Reading and writing to checkpoints is done in the +[couch_scanner_checkpoint](src/couch_scanner_checkpoint .erl) module. For +debugging or operational intervention there is a reset/0 call to delete all +the checkpoints on the local node. + +##### couch_scanner_rate_limiter + +[couch_scanner_rate_limiter](src/couch_scanner_rate_limiter.erl). To limit +plugin resource (CPU / IO) usage there is a rate limiting mechanism to ensure +all plugins can only open so many dbs and docs per second. +[couch_scanner_server](src/couch_scanner_server.erl) creates a shared token +bucket as an Erlang atomics array and periodically fills it with "tokens". +Plugin consume tokens every time they process a db or document. If they use up Review Comment: s/Plugin/Plugins/ ? ########## rel/overlay/etc/default.ini: ########## @@ -930,3 +930,99 @@ url = {{nouveau_url}} ;max_objects = 10000 ;max_idle = 600000 ;enable = true + +[couch_scanner] +; How often to check for configuration changes and start/stop plugins +;interval_sec = 5 + +; Minimum time to force a plugin to wait before running again after a crash +;min_penalty_sec = 30 + +; Maximum time to force a plugin to wait after repeated crashes (8 hours default) +;max_penalty_sec = 28800 + +; If plugin runs successfully without crashing for this long, reset its +; repeated error count +;heal_threshold_sec = 300 + +; Database processing rate limit per second. This will also be the +; rate at which design documents are fetched. 
The rate is shared +; across all running plugins. Review Comment: It's hard to know if it's necessary without seeing how it works in production, but a simple dedicated gen_server that just refills tokens might be easier to reason about. ########## src/couch_scanner/README.md: ########## @@ -0,0 +1,107 @@ +Couch Scanner +============= + +Couch Scanner is an application which traverses all the dbs and docs in the +background and emits various reports. There is a common traversal mechanism +which finds all the dbs, design docs, docs and calls various reporting plugins. +Reporting plugins implement a common API and may search for various strings in +the doc bodies, examine design docs for certain features, check index sizes, or +issue any other reports. + +Two plugins are initially included with the application are: + * [couch_scanner_plugin_find](src/couch_scanner_plugin_find.erl) : Find + occurrences of any regular expressions in the cluster. It scans through + document bodies, db name and doc ids. This can be used to, for instance, to + search accidentally leaked secrets (API keys, passwords). + * [couch_scanner_ddoc_features](src/couch_scanner_ddoc_features.erl) : Report + on various features used by design docs. By default it will report features + which will be deprecated in CouchDB 4.x such as shows, lists, rewrites, + etc. But also can be configured to search for javascript filter definition, + custom javascript reduce functions and a few other things. + +By default no plugins are enabled so the scanner application won't do anything. +Plugins can be enabled in the configuration system by setting: + +For instance, to enable `couch_scanner_ddoc_features` plugin use: + +``` +[couch_scanner_plugins] +couch_scanner_plugin_ddoc_features = true +``` + +If a node is put in maintenance mode all the plugins will be automatically +stopped on that node. When node is put back in production plugins will +automatically resume executing. It's also possible to pause plugin execution in +a remsh using the `couch_scanner:stop()` and then resume it later with +`couch_scanner:resume()`. + +#### Application Structure + +Top level application API is in the [couch_scanner](src/couch_scanner.erl) module. + + * status() -> return running status of plugins + * stop() -> stop running plugins on this node + * resume() -> resume running plugins on this node + * checkpoints() -> inspect the value of all the node local checkpoints + * reset_checkpoints() -> delete all the node local checkpoints + +##### couch_scanner_server + +Plugins are run as individual processes. These processes are managed by +[couch_scanner_server](src/couch_scanner_server.erl) gen_server. The gen_server +inspects `couch_scanner_plugins` config section and then starts a new plugin +process for each configured plugin. Then it waits for the process to exit. When +the process exits it may exit normally, crash with an error, or indicate that +it should be rescheduled to run later. Later rescheduling is indicated by +exiting with a `{shutdown, {reschedule, UnixTimeSec}}` exit value. If the +plugin crashes repeatedly it will be penalized with an exponential back-off +starting at 30 seconds and up to 8 hours. + +##### couch_scanner_plugin + +Plugin processes are proc_lib processes spawned by the +[couch_scanner_plugin](src/couch_scanner_plugin.erl) module. After spawning the +processes will read the previously saved checkpoint from a `_local` checkpoint +doc in the `_dbs` db and start traversing databases from the last checkpoint. 
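To make the "dedicated gen_server that just refills tokens" idea from the rate-limiter comment above slightly more concrete, here is a minimal hypothetical sketch. The module name, refill policy, and numbers are invented; this is not the API of `couch_scanner_rate_limiter`.

```erlang
-module(token_refiller).
-behaviour(gen_server).

-export([start_link/2, get_ref/0, try_consume/1]).
-export([init/1, handle_call/3, handle_cast/2, handle_info/2]).

%% Limit tokens are made available every IntervalMSec.
start_link(Limit, IntervalMSec) ->
    gen_server:start_link({local, ?MODULE}, ?MODULE, {Limit, IntervalMSec}, []).

%% Plugin processes fetch the shared atomics ref once, then consume from it directly.
get_ref() ->
    gen_server:call(?MODULE, get_ref).

%% true = a token was taken; false = bucket empty, caller should back off
%% (the README above describes an exponential back-off on the caller side).
try_consume(ARef) ->
    case atomics:sub_get(ARef, 1, 1) of
        N when N >= 0 -> true;
        _ -> atomics:add(ARef, 1, 1), false
    end.

init({Limit, IntervalMSec}) ->
    ARef = atomics:new(1, [{signed, true}]),
    atomics:put(ARef, 1, Limit),
    {ok, _TRef} = timer:send_interval(IntervalMSec, refill),
    {ok, {ARef, Limit}}.

handle_call(get_ref, _From, {ARef, _Limit} = St) ->
    {reply, ARef, St}.

handle_cast(_Msg, St) ->
    {noreply, St}.

%% Refill by resetting to the limit rather than adding, so idle periods
%% cannot accumulate an unbounded burst.
handle_info(refill, {ARef, Limit} = St) ->
    atomics:put(ARef, 1, Limit),
    {noreply, St}.
```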
+ +There is a plugin processes running for each configured plugin. On startup, +during traversal, checkpointing and before exiting it will call into the +corresponding API function. This works very much like the gen_server pattern. +The plugin API behavior is defined in the +[couch_scanner_plugin](src/couch_scanner_plugin.erl) module as `-callback` +directives. Plugin modules then refer to it as +`-behavior(couch_scanner_plugin)`. + +Periodically it will checkpoint the last database it processed so if the node +crashes or is stopped it will continue processing where it left off. + +During startup or when finished the plugin may exit with a +`{shutdown,{reschedule, UnixTimeSec}}` exit message to indicate that it wants +to be scheduled to run at a later time. + +##### couch_scanner_checkpoint + +Plugins periodically will checkpoint their database traversal progress to a +`_local` checkpoint doc in the `_dbs` database. Each plugin has their separate +checkpoint document. Plugin modules may implement the optional `checkpoint/1` +API and save some plugin specific data alongside the database traversal +checkpoint which gets saved automatically. For instance, it maybe useful for +plugins to save their start up configuration to detect when it changes so they +could restart their scanning. Or, it they want to accumulate some statistics +and only emit them at the end of the scan. + +Reading and writing to checkpoints is done in the +[couch_scanner_checkpoint](src/couch_scanner_checkpoint .erl) module. For +debugging or operational intervention there is a reset/0 call to delete all +the checkpoints on the local node. + +##### couch_scanner_rate_limiter + +[couch_scanner_rate_limiter](src/couch_scanner_rate_limiter.erl). To limit +plugin resource (CPU / IO) usage there is a rate limiting mechanism to ensure +all plugins can only open so many dbs and docs per second. +[couch_scanner_server](src/couch_scanner_server.erl) creates a shared token +bucket as an Erlang atomics array and periodically fills it with "tokens". +Plugin consume tokens every time they process a db or document. If they use up +all the tokens, they start to back-off exponentially. It's a simple AIMD Review Comment: Perhaps spell out what AIMD stands for, or perhaps link to a definition like https://en.wikipedia.org/wiki/Additive_increase/multiplicative_decrease ? EDIT: Ah, nm; I see you did that in the rate limiter module. ########## src/couch_scanner/src/couch_scanner_plugin.erl: ########## @@ -0,0 +1,662 @@ +% Licensed under the Apache License, Version 2.0 (the "License"); you may not +% use this file except in compliance with the License. You may obtain a copy of +% the License at +% +% http://www.apache.org/licenses/LICENSE-2.0 +% +% Unless required by applicable law or agreed to in writing, software +% distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +% WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the +% License for the specific language governing permissions and limitations under +% the License. + +% Scanner plugin runner process +% +% This is the process which is spawned and run for each enabled plugin. +% +% A number of these processes are managed by the couch_scanner_server via +% start_link/1 and complete/1 functions. After a plugin runner is spawned, the only +% thing couch_scanner_server does is wait for it to exit. 
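For anyone who, like the comment above, wants AIMD spelled out before reaching the rate limiter module: applied to a sleep interval rather than a rate, additive increase / multiplicative decrease boils down to something like the rough sketch below (constants invented, not the scanner's actual tuning).

```erlang
%% Shrink the delay a little each time a token is available (speed up gradually);
%% grow it sharply when the shared bucket is empty (back off quickly).
next_sleep_msec(token_taken, SleepMSec) ->
    max(0, SleepMSec - 10);
next_sleep_msec(bucket_empty, SleepMSec) ->
    min(8000, max(10, SleepMSec * 2)).
```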
+% +% The plugin runner process may exit normally, crash, or exit with {shutdown, +% {reschedule, TSec}} if they want to reschedule to run again at some point the +% future (next day, a week later, etc). +% +% After the process starts, it will load and validate the plugin module. Then, +% it will start scanning all the dbs and docs on the local node. Shard ranges +% will be scanned only on one of the cluster nodes to avoid duplicating work. +% For instance, if there are 2 shard ranges, 0-7, 8-f, with copies on nodes n1, +% n2, n3. Then, 0-7 might be scanned on n1 only, and 8-f on n3. +% +% The plugin API defined in the behavior definition section. +% +% The start/2 function is called when the plugin starts running. It returns +% some context (St), which can be any Erlang term. All subsequent function +% calls will be called with the same St object, and may return an updated +% version of it. +% +% If the plugin hasn't finished runing and has resumed running after the node Review Comment: maybe replace "finished runing [sic]" with "completed"? ########## src/couch_scanner/README.md: ########## @@ -0,0 +1,107 @@ +Couch Scanner +============= + +Couch Scanner is an application which traverses all the dbs and docs in the +background and emits various reports. There is a common traversal mechanism +which finds all the dbs, design docs, docs and calls various reporting plugins. +Reporting plugins implement a common API and may search for various strings in +the doc bodies, examine design docs for certain features, check index sizes, or +issue any other reports. + +Two plugins are initially included with the application are: + * [couch_scanner_plugin_find](src/couch_scanner_plugin_find.erl) : Find + occurrences of any regular expressions in the cluster. It scans through + document bodies, db name and doc ids. This can be used to, for instance, to + search accidentally leaked secrets (API keys, passwords). + * [couch_scanner_ddoc_features](src/couch_scanner_ddoc_features.erl) : Report + on various features used by design docs. By default it will report features + which will be deprecated in CouchDB 4.x such as shows, lists, rewrites, + etc. But also can be configured to search for javascript filter definition, + custom javascript reduce functions and a few other things. + +By default no plugins are enabled so the scanner application won't do anything. +Plugins can be enabled in the configuration system by setting: + +For instance, to enable `couch_scanner_ddoc_features` plugin use: + +``` +[couch_scanner_plugins] +couch_scanner_plugin_ddoc_features = true +``` + +If a node is put in maintenance mode all the plugins will be automatically +stopped on that node. When node is put back in production plugins will +automatically resume executing. It's also possible to pause plugin execution in +a remsh using the `couch_scanner:stop()` and then resume it later with +`couch_scanner:resume()`. + +#### Application Structure + +Top level application API is in the [couch_scanner](src/couch_scanner.erl) module. + + * status() -> return running status of plugins + * stop() -> stop running plugins on this node + * resume() -> resume running plugins on this node + * checkpoints() -> inspect the value of all the node local checkpoints + * reset_checkpoints() -> delete all the node local checkpoints + +##### couch_scanner_server + +Plugins are run as individual processes. These processes are managed by +[couch_scanner_server](src/couch_scanner_server.erl) gen_server. 
The gen_server +inspects `couch_scanner_plugins` config section and then starts a new plugin +process for each configured plugin. Then it waits for the process to exit. When +the process exits it may exit normally, crash with an error, or indicate that +it should be rescheduled to run later. Later rescheduling is indicated by +exiting with a `{shutdown, {reschedule, UnixTimeSec}}` exit value. If the +plugin crashes repeatedly it will be penalized with an exponential back-off +starting at 30 seconds and up to 8 hours. + +##### couch_scanner_plugin + +Plugin processes are proc_lib processes spawned by the +[couch_scanner_plugin](src/couch_scanner_plugin.erl) module. After spawning the +processes will read the previously saved checkpoint from a `_local` checkpoint +doc in the `_dbs` db and start traversing databases from the last checkpoint. + +There is a plugin processes running for each configured plugin. On startup, +during traversal, checkpointing and before exiting it will call into the +corresponding API function. This works very much like the gen_server pattern. +The plugin API behavior is defined in the +[couch_scanner_plugin](src/couch_scanner_plugin.erl) module as `-callback` +directives. Plugin modules then refer to it as +`-behavior(couch_scanner_plugin)`. + +Periodically it will checkpoint the last database it processed so if the node +crashes or is stopped it will continue processing where it left off. + +During startup or when finished the plugin may exit with a +`{shutdown,{reschedule, UnixTimeSec}}` exit message to indicate that it wants +to be scheduled to run at a later time. + +##### couch_scanner_checkpoint + +Plugins periodically will checkpoint their database traversal progress to a +`_local` checkpoint doc in the `_dbs` database. Each plugin has their separate +checkpoint document. Plugin modules may implement the optional `checkpoint/1` +API and save some plugin specific data alongside the database traversal +checkpoint which gets saved automatically. For instance, it maybe useful for +plugins to save their start up configuration to detect when it changes so they +could restart their scanning. Or, it they want to accumulate some statistics +and only emit them at the end of the scan. + +Reading and writing to checkpoints is done in the +[couch_scanner_checkpoint](src/couch_scanner_checkpoint .erl) module. For +debugging or operational intervention there is a reset/0 call to delete all Review Comment: Earlier you refer to `checkpoint/1` (in backslashes), so maybe use them here too for reset/0? ########## src/couch_scanner/src/couch_scanner_plugin.erl: ########## @@ -0,0 +1,662 @@ +% Licensed under the Apache License, Version 2.0 (the "License"); you may not +% use this file except in compliance with the License. You may obtain a copy of +% the License at +% +% http://www.apache.org/licenses/LICENSE-2.0 +% +% Unless required by applicable law or agreed to in writing, software +% distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +% WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the +% License for the specific language governing permissions and limitations under +% the License. + +% Scanner plugin runner process +% +% This is the process which is spawned and run for each enabled plugin. +% +% A number of these processes are managed by the couch_scanner_server via +% start_link/1 and complete/1 functions. After a plugin runner is spawned, the only +% thing couch_scanner_server does is wait for it to exit. 
+% +% The plugin runner process may exit normally, crash, or exit with {shutdown, +% {reschedule, TSec}} if they want to reschedule to run again at some point the +% future (next day, a week later, etc). +% +% After the process starts, it will load and validate the plugin module. Then, +% it will start scanning all the dbs and docs on the local node. Shard ranges +% will be scanned only on one of the cluster nodes to avoid duplicating work. +% For instance, if there are 2 shard ranges, 0-7, 8-f, with copies on nodes n1, +% n2, n3. Then, 0-7 might be scanned on n1 only, and 8-f on n3. +% +% The plugin API defined in the behavior definition section. +% +% The start/2 function is called when the plugin starts running. It returns +% some context (St), which can be any Erlang term. All subsequent function +% calls will be called with the same St object, and may return an updated +% version of it. +% +% If the plugin hasn't finished runing and has resumed running after the node +% was restarted or an error happened, the resume/2 function will be called. +% That's the difference between start and resume: start/2 is called when the +% scan starts from the beginning (first db, first shard, ...), and resume/2 is +% called when the scanning hasn't finished and has to continue. +% +% If start/2 or resume/2 returns `reset` then the checkpoint will be reset and Review Comment: This is a nice feature! ########## src/couch_scanner/README.md: ########## @@ -0,0 +1,107 @@ +Couch Scanner +============= + +Couch Scanner is an application which traverses all the dbs and docs in the +background and emits various reports. There is a common traversal mechanism +which finds all the dbs, design docs, docs and calls various reporting plugins. +Reporting plugins implement a common API and may search for various strings in +the doc bodies, examine design docs for certain features, check index sizes, or +issue any other reports. + +Two plugins are initially included with the application are: + * [couch_scanner_plugin_find](src/couch_scanner_plugin_find.erl) : Find + occurrences of any regular expressions in the cluster. It scans through + document bodies, db name and doc ids. This can be used to, for instance, to + search accidentally leaked secrets (API keys, passwords). + * [couch_scanner_ddoc_features](src/couch_scanner_ddoc_features.erl) : Report + on various features used by design docs. By default it will report features + which will be deprecated in CouchDB 4.x such as shows, lists, rewrites, + etc. But also can be configured to search for javascript filter definition, + custom javascript reduce functions and a few other things. + +By default no plugins are enabled so the scanner application won't do anything. +Plugins can be enabled in the configuration system by setting: + +For instance, to enable `couch_scanner_ddoc_features` plugin use: + +``` +[couch_scanner_plugins] +couch_scanner_plugin_ddoc_features = true +``` + +If a node is put in maintenance mode all the plugins will be automatically +stopped on that node. When node is put back in production plugins will +automatically resume executing. It's also possible to pause plugin execution in +a remsh using the `couch_scanner:stop()` and then resume it later with +`couch_scanner:resume()`. + +#### Application Structure + +Top level application API is in the [couch_scanner](src/couch_scanner.erl) module. 
+ + * status() -> return running status of plugins + * stop() -> stop running plugins on this node + * resume() -> resume running plugins on this node + * checkpoints() -> inspect the value of all the node local checkpoints + * reset_checkpoints() -> delete all the node local checkpoints + +##### couch_scanner_server + +Plugins are run as individual processes. These processes are managed by +[couch_scanner_server](src/couch_scanner_server.erl) gen_server. The gen_server +inspects `couch_scanner_plugins` config section and then starts a new plugin +process for each configured plugin. Then it waits for the process to exit. When +the process exits it may exit normally, crash with an error, or indicate that +it should be rescheduled to run later. Later rescheduling is indicated by +exiting with a `{shutdown, {reschedule, UnixTimeSec}}` exit value. If the +plugin crashes repeatedly it will be penalized with an exponential back-off +starting at 30 seconds and up to 8 hours. + +##### couch_scanner_plugin + +Plugin processes are proc_lib processes spawned by the +[couch_scanner_plugin](src/couch_scanner_plugin.erl) module. After spawning the +processes will read the previously saved checkpoint from a `_local` checkpoint +doc in the `_dbs` db and start traversing databases from the last checkpoint. + +There is a plugin processes running for each configured plugin. On startup, +during traversal, checkpointing and before exiting it will call into the +corresponding API function. This works very much like the gen_server pattern. +The plugin API behavior is defined in the +[couch_scanner_plugin](src/couch_scanner_plugin.erl) module as `-callback` +directives. Plugin modules then refer to it as +`-behavior(couch_scanner_plugin)`. + +Periodically it will checkpoint the last database it processed so if the node +crashes or is stopped it will continue processing where it left off. + +During startup or when finished the plugin may exit with a +`{shutdown,{reschedule, UnixTimeSec}}` exit message to indicate that it wants +to be scheduled to run at a later time. + +##### couch_scanner_checkpoint + +Plugins periodically will checkpoint their database traversal progress to a +`_local` checkpoint doc in the `_dbs` database. Each plugin has their separate +checkpoint document. Plugin modules may implement the optional `checkpoint/1` +API and save some plugin specific data alongside the database traversal +checkpoint which gets saved automatically. For instance, it maybe useful for +plugins to save their start up configuration to detect when it changes so they +could restart their scanning. Or, it they want to accumulate some statistics +and only emit them at the end of the scan. + +Reading and writing to checkpoints is done in the +[couch_scanner_checkpoint](src/couch_scanner_checkpoint .erl) module. For Review Comment: Remove space between "couch_scanner_checkpoint" and ".erl"? ########## src/couch_scanner/README.md: ########## @@ -0,0 +1,107 @@ +Couch Scanner +============= + +Couch Scanner is an application which traverses all the dbs and docs in the +background and emits various reports. There is a common traversal mechanism +which finds all the dbs, design docs, docs and calls various reporting plugins. +Reporting plugins implement a common API and may search for various strings in +the doc bodies, examine design docs for certain features, check index sizes, or +issue any other reports. 
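Referring back to the `couch_scanner_checkpoint` section quoted above, the optional `checkpoint/1` and `complete/1` callbacks of a hypothetical plugin might look like the fragment below; the field names and log line are invented, only the `{ok, EJson}` return shape follows the quoted description.

```erlang
%% Persist plugin-specific data next to the traversal checkpoint: the config the
%% scan started with (so a later resume can spot drift and reset) and a counter
%% that is only reported once the whole scan completes.
checkpoint(#{config := Cfg, dbs_seen := N} = _St) ->
    {ok, #{<<"config">> => Cfg, <<"dbs_seen">> => N}}.

complete(#{dbs_seen := N} = _St) ->
    couch_log:info("example scanner plugin finished, saw ~p dbs", [N]),
    {ok, #{<<"dbs_seen">> => N}}.
```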
+ +Two plugins are initially included with the application are: + * [couch_scanner_plugin_find](src/couch_scanner_plugin_find.erl) : Find + occurrences of any regular expressions in the cluster. It scans through + document bodies, db name and doc ids. This can be used to, for instance, to + search accidentally leaked secrets (API keys, passwords). + * [couch_scanner_ddoc_features](src/couch_scanner_ddoc_features.erl) : Report + on various features used by design docs. By default it will report features + which will be deprecated in CouchDB 4.x such as shows, lists, rewrites, + etc. But also can be configured to search for javascript filter definition, + custom javascript reduce functions and a few other things. + +By default no plugins are enabled so the scanner application won't do anything. +Plugins can be enabled in the configuration system by setting: + +For instance, to enable `couch_scanner_ddoc_features` plugin use: + +``` +[couch_scanner_plugins] +couch_scanner_plugin_ddoc_features = true +``` + +If a node is put in maintenance mode all the plugins will be automatically +stopped on that node. When node is put back in production plugins will +automatically resume executing. It's also possible to pause plugin execution in +a remsh using the `couch_scanner:stop()` and then resume it later with +`couch_scanner:resume()`. + +#### Application Structure + +Top level application API is in the [couch_scanner](src/couch_scanner.erl) module. + + * status() -> return running status of plugins + * stop() -> stop running plugins on this node + * resume() -> resume running plugins on this node + * checkpoints() -> inspect the value of all the node local checkpoints + * reset_checkpoints() -> delete all the node local checkpoints + +##### couch_scanner_server + +Plugins are run as individual processes. These processes are managed by +[couch_scanner_server](src/couch_scanner_server.erl) gen_server. The gen_server +inspects `couch_scanner_plugins` config section and then starts a new plugin +process for each configured plugin. Then it waits for the process to exit. When +the process exits it may exit normally, crash with an error, or indicate that +it should be rescheduled to run later. Later rescheduling is indicated by +exiting with a `{shutdown, {reschedule, UnixTimeSec}}` exit value. If the +plugin crashes repeatedly it will be penalized with an exponential back-off +starting at 30 seconds and up to 8 hours. + +##### couch_scanner_plugin + +Plugin processes are proc_lib processes spawned by the +[couch_scanner_plugin](src/couch_scanner_plugin.erl) module. After spawning the +processes will read the previously saved checkpoint from a `_local` checkpoint +doc in the `_dbs` db and start traversing databases from the last checkpoint. Review Comment: Not sure if too pedantic to say instead: "doc in the shards db (`_dbs` by default) ..."? ########## rel/overlay/etc/default.ini: ########## @@ -930,3 +930,99 @@ url = {{nouveau_url}} ;max_objects = 10000 ;max_idle = 600000 ;enable = true + +[couch_scanner] +; How often to check for configuration changes and start/stop plugins +;interval_sec = 5 + +; Minimum time to force a plugin to wait before running again after a crash +;min_penalty_sec = 30 + +; Maximum time to force a plugin to wait after repeated crashes (8 hours default) +;max_penalty_sec = 28800 + +; If plugin runs successfully without crashing for this long, reset its +; repeated error count +;heal_threshold_sec = 300 + +; Database processing rate limit per second. 
This will also be the +; rate at which design documents are fetched. The rate is shared +; across all running plugins. +;db_rate_limit = 50 + +; Limits the rate per second at which plugins may open db shard files +; on a node. The rate is shared across all running plugins. +;shard_rate_limit = 50 + +; Limit the rate per second at which plugins open documents. The rate +; is shared across all running plugins. +;doc_rate_limit = 1000 + +[couch_scanner_plugins] +;couch_scanner_plugin_ddoc_features = false +;couch_scanner_plugin_find = false + +; The following [$plugin*] settings apply to all plugins + +;[$plugin] +; Run plugin on or after this time. The default is to run once after the +; node starts. Times are in UTC. Possible time formats are: +; * Unix seconds: 1712338014 +; * Date/Time: YYYY-MM-DDTHH, YYYY-MM-DDTHH:MM, YYYY-MM-DDTHH:MM:SS, YYYY-MM-DDTHH:MM:SSZ +;after = restart +; Run the plugin periodically. By default it will run once after node the node starts. +; Possible period formats are: +; * $num_$timeunit: 1000_sec, 30_min, 8_hours, 24_hour, 2_days, 3_weeks, 1_month +; * $weekday: mon, monday, Thu, thursdays +;repeat = restart + +;[$plugin.skips_dbs] +; Skip over databases if their names contain any of the strings in this section. +;string1 = true +;string2 = true Review Comment: Perhaps another possibility could be to just have a single "skip" section, and then entries like e.g. ```ini [$plugin.skip] db_regex = regex1 doc_regex = regex2 ddoc_regex = regex3 ``` ? With such a design, I'm not sure if it's necessary for repeated keys, or if one could just construct more elaborate regexes using `|` etc. ########## src/couch_scanner/src/couch_scanner_plugin.erl: ########## @@ -0,0 +1,662 @@ +% Licensed under the Apache License, Version 2.0 (the "License"); you may not +% use this file except in compliance with the License. You may obtain a copy of +% the License at +% +% http://www.apache.org/licenses/LICENSE-2.0 +% +% Unless required by applicable law or agreed to in writing, software +% distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +% WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the +% License for the specific language governing permissions and limitations under +% the License. + +% Scanner plugin runner process +% +% This is the process which is spawned and run for each enabled plugin. +% +% A number of these processes are managed by the couch_scanner_server via +% start_link/1 and complete/1 functions. After a plugin runner is spawned, the only +% thing couch_scanner_server does is wait for it to exit. +% +% The plugin runner process may exit normally, crash, or exit with {shutdown, +% {reschedule, TSec}} if they want to reschedule to run again at some point the +% future (next day, a week later, etc). +% +% After the process starts, it will load and validate the plugin module. Then, +% it will start scanning all the dbs and docs on the local node. Shard ranges +% will be scanned only on one of the cluster nodes to avoid duplicating work. +% For instance, if there are 2 shard ranges, 0-7, 8-f, with copies on nodes n1, +% n2, n3. Then, 0-7 might be scanned on n1 only, and 8-f on n3. +% +% The plugin API defined in the behavior definition section. +% +% The start/2 function is called when the plugin starts running. It returns +% some context (St), which can be any Erlang term. All subsequent function +% calls will be called with the same St object, and may return an updated +% version of it. 
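As an entirely hypothetical illustration of the callback flow described above, a minimal plugin module implementing just the required callbacks could look like this; the module name and map fields are invented.

```erlang
-module(couch_scanner_plugin_example).
-behaviour(couch_scanner_plugin).

-export([start/2, resume/2, db/2]).

%% Fresh scan: build the initial St term (any Erlang term; a map here).
start(ScanId, #{} = _LastCheckpointEJson) ->
    {ok, #{scan_id => ScanId, dbs_seen => 0}}.

%% An unfinished scan continuing after a restart or crash; returning reset
%% instead would wipe the checkpoint and start over from the first db.
resume(ScanId, #{} = _CheckpointEJson) ->
    {ok, #{scan_id => ScanId, dbs_seen => 0}}.

%% Called for every cluster db: ok descends into it, skip moves on to the
%% next db, stop ends the traversal for this plugin.
db(#{dbs_seen := N} = St, _DbName) ->
    {ok, St#{dbs_seen := N + 1}}.
```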
+% +% If the plugin hasn't finished runing and has resumed running after the node +% was restarted or an error happened, the resume/2 function will be called. +% That's the difference between start and resume: start/2 is called when the +% scan starts from the beginning (first db, first shard, ...), and resume/2 is +% called when the scanning hasn't finished and has to continue. +% +% If start/2 or resume/2 returns `reset` then the checkpoint will be reset and +% the plugin will be restarted. This may be useful in cases when the plugin +% detects configuration changes since last scanning session had already +% started, or when the plugin module was updated and the checkpoint version is +% stale. +% +% The checkpoint/1 callback is periodically called to checkpoint the scanning +% progress. start/2 and resume/2 function will be called with the last saved +% checkpoint map value. +% +% The complete/1 callback is called when the scan has finished. The complete +% callback should return final checkpoint map object. The last checkoint will +% be written and then. Review Comment: It seems like this sentence ends prematurely? ########## src/couch_scanner/src/couch_scanner_plugin.erl: ########## @@ -0,0 +1,662 @@ +% Licensed under the Apache License, Version 2.0 (the "License"); you may not +% use this file except in compliance with the License. You may obtain a copy of +% the License at +% +% http://www.apache.org/licenses/LICENSE-2.0 +% +% Unless required by applicable law or agreed to in writing, software +% distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +% WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the +% License for the specific language governing permissions and limitations under +% the License. + +% Scanner plugin runner process +% +% This is the process which is spawned and run for each enabled plugin. +% +% A number of these processes are managed by the couch_scanner_server via +% start_link/1 and complete/1 functions. After a plugin runner is spawned, the only +% thing couch_scanner_server does is wait for it to exit. +% +% The plugin runner process may exit normally, crash, or exit with {shutdown, +% {reschedule, TSec}} if they want to reschedule to run again at some point the +% future (next day, a week later, etc). +% +% After the process starts, it will load and validate the plugin module. Then, +% it will start scanning all the dbs and docs on the local node. Shard ranges +% will be scanned only on one of the cluster nodes to avoid duplicating work. +% For instance, if there are 2 shard ranges, 0-7, 8-f, with copies on nodes n1, +% n2, n3. Then, 0-7 might be scanned on n1 only, and 8-f on n3. +% +% The plugin API defined in the behavior definition section. +% +% The start/2 function is called when the plugin starts running. It returns +% some context (St), which can be any Erlang term. All subsequent function +% calls will be called with the same St object, and may return an updated +% version of it. +% +% If the plugin hasn't finished runing and has resumed running after the node +% was restarted or an error happened, the resume/2 function will be called. +% That's the difference between start and resume: start/2 is called when the +% scan starts from the beginning (first db, first shard, ...), and resume/2 is +% called when the scanning hasn't finished and has to continue. +% +% If start/2 or resume/2 returns `reset` then the checkpoint will be reset and +% the plugin will be restarted. 
This may be useful in cases when the plugin +% detects configuration changes since last scanning session had already +% started, or when the plugin module was updated and the checkpoint version is +% stale. +% +% The checkpoint/1 callback is periodically called to checkpoint the scanning +% progress. start/2 and resume/2 function will be called with the last saved +% checkpoint map value. +% +% The complete/1 callback is called when the scan has finished. The complete +% callback should return final checkpoint map object. The last checkoint will +% be written and then. +% +% As the cluster dbs, shards, ddocs and individual docs are discovered during +% scanning, the appropriate callbacks will be called. Most callbacks, besides +% the updated St object, can reply with ok, skip or complete tags. The meaning of +% those are: +% +% * ok - continue to the next object +% +% * skip - skip the current object and don't scan its internal (ex: skip a db +% and don't scan its ddocs, but continue with the next db) +% +% * stop - stop scanning any remaining objects of that type (ex: don't scan +% any more dbs) +% +% * reset - stop, reset the checkpoint data and restart, this may be useful +% if the configuration changes and it's best to just restart with the new +% settings + +-module(couch_scanner_plugin). + +-export([ + % Main plugin process API + spawn_link/2, + stop/1, + % Internal export + run/2 +]). + +-include_lib("couch_scanner/include/couch_scanner_plugin.hrl"). +-include_lib("couch_mrview/include/couch_mrview.hrl"). + +% Behaviour callback definitions + +-callback start(ScanId :: binary(), EJson :: #{}) -> + {ok, St :: term()} | skip | reset. + +-callback resume(ScanId :: binary(), EJson :: #{}) -> + {ok, St :: term()} | skip | reset. + +% Optional +-callback complete(St :: term()) -> + {ok, EJson :: #{}}. + +% Optional +-callback checkpoint(St :: term()) -> + {ok, EJson :: #{}}. + +-callback db(St :: term(), DbName :: binary()) -> + {ok | skip | stop, St1 :: term()}. + +% Optional +-callback ddoc(St :: term(), DbName :: binary(), #doc{}) -> + {ok | stop, St1 :: term()}. + +% Optional. If no subsequent callbacks are defined, then the default function +% returns [] (don't open any shards). If any subsequent callbacks are defined, +% the default action is to return all the shards in the list. +-callback shards(St :: term(), [#shard{}]) -> + {[#shard{}], St1 :: term()}. + +% Optional +-callback db_opened(St :: term(), Db :: term()) -> + {ok, St :: term()}. + +% Optional. If doc is not defined, then ddoc_id default action is {skip, St}. +% If it is defined, the default action is {ok, St}. +-callback doc_id(St :: term(), DocId :: binary(), Db :: term()) -> + {ok | skip | stop, St1 :: term()}. + +% Optional. +-callback doc(St :: term(), Db :: term(), #doc{}) -> + {ok | stop, St1 :: term()}. + +% Optional. +-callback db_closing(St :: term(), Db :: term()) -> + {ok, St1 :: term()}. + +-optional_callbacks([ + complete/1, + checkpoint/1, + ddoc/3, + shards/2, + db_opened/2, + doc_id/3, + doc/3, + db_closing/2 +]). + +-define(CALLBACKS, [ + {start, 2, fun required_callback/3}, + {resume, 2, fun required_callback/3}, + {complete, 1, fun default_complete/3}, + {checkpoint, 1, fun default_checkpoint/3}, + {db, 2, fun required_callback/3}, + {ddoc, 3, fun default_ddoc/3}, + {shards, 2, fun default_shards/3}, + {db_opened, 2, fun default_db_opened/3}, + {doc_id, 3, fun default_doc_id/3}, + {doc, 3, fun default_doc/3}, + {db_closing, 2, fun default_db_closing/3} +]). + +-define(CHECKPOINT_INTERVAL_SEC, 10). 
+-define(STOP_TIMEOUT_SEC, 5). + +-record(st, { + id, + rlimiter, + scan_id, + mod, + callbacks = #{}, + pst, + dbname, + cursor, + shards_db, + db, + checkpoint_sec = 0, + start_sec = 0, + skip_dbs, + skip_ddocs, + skip_docs +}). + +spawn_link(Id, RLimiter) -> + proc_lib:spawn_link(?MODULE, run, [Id, RLimiter]). + +stop(Pid) when is_pid(Pid) -> + unlink(Pid), + Ref = erlang:monitor(process, Pid), + Pid ! stop, + receive + {'DOWN', Ref, _, _, _} -> ok + after ?STOP_TIMEOUT_SEC * 1000 -> + exit(Pid, kill), + receive + {'DOWN', Ref, _, _, _} -> ok + end + end, + ok. + +% Main run function + +run(Id, RLimiter) -> + {Mod, Callbacks} = plugin_mod(Id), + St = #st{ + id = Id, + mod = Mod, + callbacks = Callbacks, + rlimiter = RLimiter + }, + St1 = init_config(St), + St2 = init_from_checkpoint(St1), + St3 = scan_dbs(St2), + finalize(St3). + +% Private functions + +init_config(#st{mod = Mod} = St) -> + St#st{ + skip_dbs = config_match_patterns(Mod, "skip_dbs"), + skip_ddocs = config_match_patterns(Mod, "skip_ddocs"), + skip_docs = config_match_patterns(Mod, "skip_docs") + }. + +init_from_checkpoint(#st{} = St) -> + #st{id = Id, mod = Mod, callbacks = Cbks} = St, + case couch_scanner_checkpoint:read(Id) of + #{ + <<"state">> := <<"running">>, + <<"cursor">> := Cur, + <<"scan_id">> := SId, + <<"pst">> := EJsonPSt, + <<"start_sec">> := StartSec + } -> + Now = tsec(), + PSt = resume_callback(Cbks, SId, EJsonPSt), + St#st{ + pst = PSt, + cursor = Cur, + checkpoint_sec = Now, + start_sec = StartSec, + scan_id = SId + }; + not_found -> + SId = couch_scanner_util:new_scan_id(), + Now = tsec(), + LastStartSec = 0, + Cur = <<>>, + PSt = start_callback(Mod, Cbks, Now, SId, LastStartSec, #{}), + ok = start_checkpoint(Id, Cbks, Now, SId, Cur, PSt), + St#st{ + pst = PSt, + cursor = Cur, + checkpoint_sec = 0, + start_sec = Now, + scan_id = SId + }; + #{ + <<"state">> := <<"finished">>, + <<"pst">> := EJson, + <<"start_sec">> := LastStartSec + } -> + SId = couch_scanner_util:new_scan_id(), + Now = tsec(), + Cur = <<>>, + PSt = start_callback(Mod, Cbks, Now, SId, LastStartSec, EJson), + ok = start_checkpoint(Id, Cbks, Now, SId, Cur, PSt), + St#st{ + pst = PSt, + cursor = Cur, + checkpoint_sec = Now, + start_sec = Now, + scan_id = SId + } + end. + +scan_dbs(#st{cursor = Cursor} = St) -> + DbsDbName = mem3_sync:shards_db(), Review Comment: Perhaps call this `ShardsDbName`? -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. To unsubscribe, e-mail: notifications-unsubscr...@couchdb.apache.org For queries about this service, please contact Infrastructure at: us...@infra.apache.org