comaniac commented on a change in pull request #6270:
URL: https://github.com/apache/incubator-tvm/pull/6270#discussion_r470071293



##########
File path: python/tvm/auto_scheduler/auto_schedule.py
##########
@@ -161,7 +161,7 @@ def __init__(self, task, schedule_cost_model=RandomModel(), 
params=None, seed=No
             seed or random.randint(1, 1 << 30), verbose, init_search_callbacks)
 
     def generate_sketches(self, print_for_debug=False):
-        """ Generate the sketches, this is mainly used for debug.
+        """ Generate the sketches. This is mainly used for debugging and 
testing.

Review comment:
      I know the meaning of this description, but it may confuse people. Maybe 
we can either not say this is mainly for debugging and testing, or explicitly 
say this is mainly for debugging and testing because the auto scheduler uses 
them on the C++ side?

##########
File path: python/tvm/auto_scheduler/cost_model/xgb_model.py
##########
@@ -0,0 +1,590 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+# pylint: disable=invalid-name
+
+"""Cost model based on xgboost"""
+import multiprocessing
+import logging
+from collections import defaultdict
+
+import numpy as np
+import xgboost as xgb
+from xgboost.core import EarlyStopException
+from xgboost.callback import _fmt_metric
+from xgboost.training import aggcv
+
+from tvm.autotvm.tuner.metric import max_curve
+from .cost_model import PythonBasedModel
+from ..feature import get_per_store_features_from_measure_pairs, 
get_per_store_features_from_states
+from ..measure_record import RecordReader
+
+logger = logging.getLogger('auto_scheduler')
+
+class XGBDMatrixContext:
+    """A global context to hold additional attributes of xgb.DMatrix"""
+    def __init__(self):
+        self.context_dict = defaultdict(dict)
+
+    def get(self, key, matrix, default=None):
+        """
+        Get an attribute of a xgb.DMatrix
+
+        Parameters
+        ----------
+        key: str
+            The name of the attribute
+        matrix: xgb.DMatrix
+            The matrix
+        default: Optional

Review comment:
       ```suggestion
           default: Optional[Any]
   ```

##########
File path: python/tvm/auto_scheduler/cost_model/xgb_model.py
##########
@@ -0,0 +1,590 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+# pylint: disable=invalid-name
+
+"""Cost model based on xgboost"""
+import multiprocessing
+import logging
+from collections import defaultdict
+
+import numpy as np
+import xgboost as xgb
+from xgboost.core import EarlyStopException
+from xgboost.callback import _fmt_metric
+from xgboost.training import aggcv
+
+from tvm.autotvm.tuner.metric import max_curve
+from .cost_model import PythonBasedModel
+from ..feature import get_per_store_features_from_measure_pairs, 
get_per_store_features_from_states
+from ..measure_record import RecordReader
+
+logger = logging.getLogger('auto_scheduler')
+
+class XGBDMatrixContext:
+    """A global context to hold additional attributes of xgb.DMatrix"""
+    def __init__(self):
+        self.context_dict = defaultdict(dict)
+
+    def get(self, key, matrix, default=None):
+        """
+        Get an attribute of a xgb.DMatrix
+
+        Parameters
+        ----------
+        key: str
+            The name of the attribute
+        matrix: xgb.DMatrix
+            The matrix
+        default: Optional
+            The default value if the item does not exist
+        """
+        return self.context_dict[key].get(matrix.handle.value, default)
+
+    def set(self, key, matrix, value):
+        """
+        Set an attribute for a xgb.DMatrix
+
+        Parameters
+        ----------
+        key: str
+            The name of the attribute
+        matrix: xgb.DMatrix
+            The matrix
+        value: Optional
+            The new value
+        """
+        self.context_dict[key][matrix.handle.value] = value
+
+dmatrix_context = XGBDMatrixContext()
+
+
+class XGBModel(PythonBasedModel):
+    """Train a XGBoost model to predict the normalized throughputs of programs.
+
+    Let the normalized throughput be the score of a program (higher is 
better). We predict
+    (approximiate) the score of a program = the sum of the scores of all 
stages in this program.
+    i.e. score(P) = score_s0 + score_s1 + ... + score_sn,
+    where score_si is the score of Stage i in Program P.
+
+    We extract feature for each stage and let the xgboost predict the score 
for each stage.
+    We then sum up the predictions as the score of the whole program.
+
+    We use RMSE as the loss function.  i.e. loss(P, y) = 1/2 * (score(P) - 
y)^2,
+    where P is the program and y is the normalized throughput according to
+    the ground truth (measurement).
+    XGBoost does not support this loss function because `score(P)` is a sum of 
the prediction
+    of several samples, so we implemented a custom loss function and call it 
pack-sum-rmse.
+    It is called "pack-sum" because we combine several samples into a "pack" 
and sum up
+    their predictions.
+    """
+    def __init__(self, verbose_eval=25, num_warmup_sample=100, seed=None):
+        self.xgb_params = {
+            'max_depth': 10,
+            'gamma': 0.001,
+            'min_child_weight': 0,
+            'eta': 0.2,
+            # todo(merrymercy): automatically decrease learning rate when the 
loss is too large
+
+            'n_gpus': 0,
+            'nthread': multiprocessing.cpu_count() // 2,
+            'verbosity': 0,
+            'seed': seed or 43,
+            'disable_default_eval_metric': 1
+        }
+        self.bst = None
+        self.plan_size = 32
+        self.num_warmup_sample = num_warmup_sample
+        self.verbose_eval = verbose_eval
+
+        super().__init__()
+
+        # cache measurement input/result pairs and extracted features
+        self.inputs = []
+        self.results = []
+        self.inputs_feature_cache = []
+
+    def update(self, inputs, results):
+        """Update the cost model according to new measurement results 
(training data).
+
+        Parameters
+        ----------
+        inputs : List[MeasureInput]
+            The measurement inputs
+        results : List[MeasureResult]
+            The measurement results
+        """
+        if len(inputs) <= 0:

Review comment:
       We should also check/assert `len(inputs) == len(results)`.

##########
File path: python/tvm/auto_scheduler/cost_model/xgb_model.py
##########
@@ -0,0 +1,590 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+# pylint: disable=invalid-name
+
+"""Cost model based on xgboost"""
+import multiprocessing
+import logging
+from collections import defaultdict
+
+import numpy as np
+import xgboost as xgb
+from xgboost.core import EarlyStopException
+from xgboost.callback import _fmt_metric
+from xgboost.training import aggcv
+
+from tvm.autotvm.tuner.metric import max_curve
+from .cost_model import PythonBasedModel
+from ..feature import get_per_store_features_from_measure_pairs, 
get_per_store_features_from_states
+from ..measure_record import RecordReader
+
+logger = logging.getLogger('auto_scheduler')
+
+class XGBDMatrixContext:
+    """A global context to hold additional attributes of xgb.DMatrix"""
+    def __init__(self):
+        self.context_dict = defaultdict(dict)
+
+    def get(self, key, matrix, default=None):
+        """
+        Get an attribute of a xgb.DMatrix
+
+        Parameters
+        ----------
+        key: str
+            The name of the attribute
+        matrix: xgb.DMatrix
+            The matrix
+        default: Optional
+            The default value if the item does not exist
+        """
+        return self.context_dict[key].get(matrix.handle.value, default)
+
+    def set(self, key, matrix, value):
+        """
+        Set an attribute for a xgb.DMatrix
+
+        Parameters
+        ----------
+        key: str
+            The name of the attribute
+        matrix: xgb.DMatrix
+            The matrix
+        value: Optional

Review comment:
       ```suggestion
           value: Optional[Any]
   ```

##########
File path: python/tvm/auto_scheduler/cost_model/xgb_model.py
##########
@@ -0,0 +1,590 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+# pylint: disable=invalid-name
+
+"""Cost model based on xgboost"""
+import multiprocessing
+import logging
+from collections import defaultdict
+
+import numpy as np
+import xgboost as xgb
+from xgboost.core import EarlyStopException
+from xgboost.callback import _fmt_metric
+from xgboost.training import aggcv
+
+from tvm.autotvm.tuner.metric import max_curve
+from .cost_model import PythonBasedModel
+from ..feature import get_per_store_features_from_measure_pairs, 
get_per_store_features_from_states
+from ..measure_record import RecordReader
+
+logger = logging.getLogger('auto_scheduler')
+
+class XGBDMatrixContext:
+    """A global context to hold additional attributes of xgb.DMatrix"""
+    def __init__(self):
+        self.context_dict = defaultdict(dict)
+
+    def get(self, key, matrix, default=None):
+        """
+        Get an attribute of a xgb.DMatrix
+
+        Parameters
+        ----------
+        key: str
+            The name of the attribute
+        matrix: xgb.DMatrix
+            The matrix
+        default: Optional
+            The default value if the item does not exist
+        """
+        return self.context_dict[key].get(matrix.handle.value, default)
+
+    def set(self, key, matrix, value):
+        """
+        Set an attribute for a xgb.DMatrix
+
+        Parameters
+        ----------
+        key: str
+            The name of the attribute
+        matrix: xgb.DMatrix
+            The matrix
+        value: Optional
+            The new value
+        """
+        self.context_dict[key][matrix.handle.value] = value
+
+dmatrix_context = XGBDMatrixContext()
+
+
+class XGBModel(PythonBasedModel):
+    """Train a XGBoost model to predict the normalized throughputs of programs.
+
+    Let the normalized throughput be the score of a program (higher is 
better). We predict
+    (approximiate) the score of a program = the sum of the scores of all 
stages in this program.

Review comment:
       ```suggestion
       the (approximated) score of a program = the sum of the scores of all 
stages in this program.
   ```

##########
File path: python/tvm/auto_scheduler/cost_model/xgb_model.py
##########
@@ -0,0 +1,590 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+# pylint: disable=invalid-name
+
+"""Cost model based on xgboost"""
+import multiprocessing
+import logging
+from collections import defaultdict
+
+import numpy as np
+import xgboost as xgb
+from xgboost.core import EarlyStopException
+from xgboost.callback import _fmt_metric
+from xgboost.training import aggcv
+
+from tvm.autotvm.tuner.metric import max_curve
+from .cost_model import PythonBasedModel
+from ..feature import get_per_store_features_from_measure_pairs, 
get_per_store_features_from_states
+from ..measure_record import RecordReader
+
+logger = logging.getLogger('auto_scheduler')
+
+class XGBDMatrixContext:
+    """A global context to hold additional attributes of xgb.DMatrix"""
+    def __init__(self):
+        self.context_dict = defaultdict(dict)
+
+    def get(self, key, matrix, default=None):
+        """
+        Get an attribute of a xgb.DMatrix
+
+        Parameters
+        ----------
+        key: str
+            The name of the attribute
+        matrix: xgb.DMatrix
+            The matrix
+        default: Optional
+            The default value if the item does not exist
+        """
+        return self.context_dict[key].get(matrix.handle.value, default)
+
+    def set(self, key, matrix, value):
+        """
+        Set an attribute for a xgb.DMatrix
+
+        Parameters
+        ----------
+        key: str
+            The name of the attribute
+        matrix: xgb.DMatrix
+            The matrix
+        value: Optional
+            The new value
+        """
+        self.context_dict[key][matrix.handle.value] = value
+
+dmatrix_context = XGBDMatrixContext()
+
+
+class XGBModel(PythonBasedModel):
+    """Train a XGBoost model to predict the normalized throughputs of programs.
+
+    Let the normalized throughput be the score of a program (higher is 
better). We predict
+    (approximiate) the score of a program = the sum of the scores of all 
stages in this program.
+    i.e. score(P) = score_s0 + score_s1 + ... + score_sn,
+    where score_si is the score of Stage i in Program P.
+
+    We extract feature for each stage and let the xgboost predict the score 
for each stage.
+    We then sum up the predictions as the score of the whole program.
+
+    We use RMSE as the loss function.  i.e. loss(P, y) = 1/2 * (score(P) - 
y)^2,
+    where P is the program and y is the normalized throughput according to
+    the ground truth (measurement).
+    XGBoost does not support this loss function because `score(P)` is a sum of 
the prediction
+    of several samples, so we implemented a custom loss function and call it 
pack-sum-rmse.
+    It is called "pack-sum" because we combine several samples into a "pack" 
and sum up
+    their predictions.
+    """
+    def __init__(self, verbose_eval=25, num_warmup_sample=100, seed=None):
+        self.xgb_params = {
+            'max_depth': 10,
+            'gamma': 0.001,
+            'min_child_weight': 0,
+            'eta': 0.2,
+            # todo(merrymercy): automatically decrease learning rate when the 
loss is too large
+
+            'n_gpus': 0,
+            'nthread': multiprocessing.cpu_count() // 2,
+            'verbosity': 0,
+            'seed': seed or 43,
+            'disable_default_eval_metric': 1
+        }
+        self.bst = None
+        self.plan_size = 32
+        self.num_warmup_sample = num_warmup_sample
+        self.verbose_eval = verbose_eval
+
+        super().__init__()
+
+        # cache measurement input/result pairs and extracted features
+        self.inputs = []
+        self.results = []
+        self.inputs_feature_cache = []
+
+    def update(self, inputs, results):
+        """Update the cost model according to new measurement results 
(training data).
+
+        Parameters
+        ----------
+        inputs : List[MeasureInput]
+            The measurement inputs
+        results : List[MeasureResult]
+            The measurement results
+        """
+        if len(inputs) <= 0:
+            return
+
+        self.inputs.extend(inputs)
+        self.results.extend(results)
+
+        # extract feature
+        n_cached = len(self.inputs_feature_cache)
+        features, normalized_throughputs, task_ids = \
+            get_per_store_features_from_measure_pairs(self.inputs, 
self.results,
+                                                      
skip_first_n_feature_extraction=n_cached)
+        if n_cached > 0:
+            features = list(features)
+            features[:n_cached] = self.inputs_feature_cache
+            features = np.array(features, dtype=object)
+        self.inputs_feature_cache = features
+        dtrain = pack_sum_xgbmatrix(features, normalized_throughputs,
+                                    task_ids, normalized_throughputs)
+
+        # train xgb model
+        self.bst = xgb.train(self.xgb_params, dtrain,
+                             num_boost_round=10000,
+                             obj=pack_sum_square_error,
+                             callbacks=[custom_callback(
+                                 stopping_rounds=50,
+                                 metric='tr-p-rmse',
+                                 fevals=[
+                                     pack_sum_rmse, 
pack_sum_average_peak_score(self.plan_size),
+                                 ],
+                                 evals=[(dtrain, 'tr')],
+                                 maximize=False,
+                                 verbose_eval=self.verbose_eval)])
+
+    def predict(self, task, states):
+        """Predict the scores of states
+
+        Parameters
+        ----------
+        search_task : SearchTask
+            The search task of states
+        statse : List[State]
+            The input states
+
+        Returns
+        -------
+        scores: List[float]
+            The predicted scores for all states
+        """
+        features = get_per_store_features_from_states(states, task)
+        if self.bst is not None and len(self.inputs) > self.num_warmup_sample:
+            dtest, pack_ids = feature_to_pack_sum_xgbmatrix(features)
+            raw_preds = self.bst.predict(dtest)
+            ret = predict_throughput_pack_sum(raw_preds, pack_ids)
+        else:
+            ret = np.random.uniform(0, 1, (len(states),))
+
+        # Predict 0 for invalid states that failed to be lowered.
+        for idx, feature in enumerate(features):
+            if feature.min() == feature.max() == 0:
+                ret[idx] = float('-inf')
+
+        return ret
+
+    def predict_stages(self, task, states):
+        """Predict the scores of all stages in states. This is the breakdown 
version of `predict`.
+
+        Parameters
+        ----------
+        search_task : SearchTask
+            The search task of states
+        statse : List[State]
+            The input states
+
+        Returns
+        -------
+        scores: List[float]
+            The predicted scores for all stages in all states in the packed 
format
+
+        Note
+        ----
+        For faster data copy between c++ and python, the python part returns 
scores in a
+        single flatten array using a packed format. The c++ part then unpacks 
the flatten array.
+
+        The packed format is:
+        {
+          float  scores[N];                 // scores[i] is the score for 
states[i].
+          int    n_stage_0;                 // the number of stages in 
states[0]
+          float  stage_scores_0[[n_stage_0] // the scores for all stages in 
states[0]
+          int    n_stage_1;                 // the number of stages in 
states[1]
+          float  stage_scores_1[n_stage_1]; // the scores for all stages in 
states[1]
+          ...
+          int    n_stage_i;                 // the number of stages in 
states[i]
+          float  stage_scores_1[n_stage_i]; // the scores for all stages in 
states[i]
+          ...  // untill i == N - 1
+        }
+        To implement this format, we also store int as float, so we can store 
all numbers
+        into a single float array.
+        """
+        features = get_per_store_features_from_states(states, task)
+        if self.bst is not None and len(self.inputs) > self.num_warmup_sample:
+            dtest, pack_ids = feature_to_pack_sum_xgbmatrix(features)
+            raw_preds = self.bst.predict(dtest)
+            breakdown = predict_throughput_pack_sum(raw_preds, pack_ids)
+            stage_scores = [[] for _ in range(len(states))]
+            for pred, pack_id in zip(raw_preds, pack_ids):
+                stage_scores[pack_id].append(pred)
+            for idx, stage_score in enumerate(stage_scores):
+                breakdown = np.append(breakdown, len(stage_score))
+                breakdown = np.concatenate((breakdown, np.array(stage_score)))
+        else:
+            breakdown = np.concatenate(
+                (np.random.uniform(0, 1, (len(states), )), 
np.zeros(len(states), )))
+
+        # Predict 0 for invalid states that failed to be lowered.
+        for idx, feature in enumerate(features):
+            if feature.min() == feature.max() == 0:
+                breakdown[idx] = float('-inf')
+
+        return breakdown
+
+    def load_log_file(self, file_name, n_lines=None):
+        """Load measure records from a log file to pre-train the cost model
+
+        Parameters
+        ----------
+        file_name: str
+            The filename
+        n_lines: int
+            Only

Review comment:
       - s/int/Optional[int]/
   - The description seems incomplete.

##########
File path: python/tvm/auto_scheduler/cost_model/xgb_model.py
##########
@@ -0,0 +1,590 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+# pylint: disable=invalid-name
+
+"""Cost model based on xgboost"""
+import multiprocessing
+import logging
+from collections import defaultdict
+
+import numpy as np
+import xgboost as xgb
+from xgboost.core import EarlyStopException
+from xgboost.callback import _fmt_metric
+from xgboost.training import aggcv
+
+from tvm.autotvm.tuner.metric import max_curve
+from .cost_model import PythonBasedModel
+from ..feature import get_per_store_features_from_measure_pairs, 
get_per_store_features_from_states
+from ..measure_record import RecordReader
+
+logger = logging.getLogger('auto_scheduler')
+
+class XGBDMatrixContext:
+    """A global context to hold additional attributes of xgb.DMatrix"""
+    def __init__(self):
+        self.context_dict = defaultdict(dict)
+
+    def get(self, key, matrix, default=None):
+        """
+        Get an attribute of a xgb.DMatrix
+
+        Parameters
+        ----------
+        key: str
+            The name of the attribute
+        matrix: xgb.DMatrix
+            The matrix
+        default: Optional
+            The default value if the item does not exist
+        """
+        return self.context_dict[key].get(matrix.handle.value, default)
+
+    def set(self, key, matrix, value):
+        """
+        Set an attribute for a xgb.DMatrix
+
+        Parameters
+        ----------
+        key: str
+            The name of the attribute
+        matrix: xgb.DMatrix
+            The matrix
+        value: Optional
+            The new value
+        """
+        self.context_dict[key][matrix.handle.value] = value
+
+dmatrix_context = XGBDMatrixContext()
+
+
+class XGBModel(PythonBasedModel):
+    """Train a XGBoost model to predict the normalized throughputs of programs.
+
+    Let the normalized throughput be the score of a program (higher is 
better). We predict
+    (approximiate) the score of a program = the sum of the scores of all 
stages in this program.
+    i.e. score(P) = score_s0 + score_s1 + ... + score_sn,
+    where score_si is the score of Stage i in Program P.
+
+    We extract feature for each stage and let the xgboost predict the score 
for each stage.
+    We then sum up the predictions as the score of the whole program.
+
+    We use RMSE as the loss function.  i.e. loss(P, y) = 1/2 * (score(P) - 
y)^2,
+    where P is the program and y is the normalized throughput according to
+    the ground truth (measurement).
+    XGBoost does not support this loss function because `score(P)` is a sum of 
the prediction
+    of several samples, so we implemented a custom loss function and call it 
pack-sum-rmse.
+    It is called "pack-sum" because we combine several samples into a "pack" 
and sum up
+    their predictions.
+    """
+    def __init__(self, verbose_eval=25, num_warmup_sample=100, seed=None):
+        self.xgb_params = {
+            'max_depth': 10,
+            'gamma': 0.001,
+            'min_child_weight': 0,
+            'eta': 0.2,
+            # todo(merrymercy): automatically decrease learning rate when the 
loss is too large
+
+            'n_gpus': 0,
+            'nthread': multiprocessing.cpu_count() // 2,
+            'verbosity': 0,
+            'seed': seed or 43,
+            'disable_default_eval_metric': 1
+        }
+        self.bst = None
+        self.plan_size = 32
+        self.num_warmup_sample = num_warmup_sample
+        self.verbose_eval = verbose_eval
+
+        super().__init__()
+
+        # cache measurement input/result pairs and extracted features
+        self.inputs = []
+        self.results = []
+        self.inputs_feature_cache = []
+
+    def update(self, inputs, results):
+        """Update the cost model according to new measurement results 
(training data).

Review comment:
      1. It would be better to mention that we will train a new model from 
scratch every time `update` is invoked.
   2. This function assumes that there won't be duplicated inputs, because we 
guarantee the same schedule will be measured only once. It would be better to 
mention that as well.

##########
File path: python/tvm/auto_scheduler/cost_model/xgb_model.py
##########
@@ -0,0 +1,590 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+# pylint: disable=invalid-name
+
+"""Cost model based on xgboost"""
+import multiprocessing
+import logging
+from collections import defaultdict
+
+import numpy as np
+import xgboost as xgb
+from xgboost.core import EarlyStopException
+from xgboost.callback import _fmt_metric
+from xgboost.training import aggcv
+
+from tvm.autotvm.tuner.metric import max_curve
+from .cost_model import PythonBasedModel
+from ..feature import get_per_store_features_from_measure_pairs, 
get_per_store_features_from_states
+from ..measure_record import RecordReader
+
+logger = logging.getLogger('auto_scheduler')
+
class XGBDMatrixContext:
    """A global context to hold additional attributes of xgb.DMatrix"""
    def __init__(self):
        # attribute name -> {DMatrix handle value -> stored attribute value}
        self.context_dict = defaultdict(dict)

    def get(self, key, matrix, default=None):
        """
        Get an attribute of a xgb.DMatrix

        Parameters
        ----------
        key: str
            The name of the attribute
        matrix: xgb.DMatrix
            The matrix
        default: Optional
            The default value if the item does not exist
        """
        per_key_store = self.context_dict[key]
        return per_key_store.get(matrix.handle.value, default)

    def set(self, key, matrix, value):
        """
        Set an attribute for a xgb.DMatrix

        Parameters
        ----------
        key: str
            The name of the attribute
        matrix: xgb.DMatrix
            The matrix
        value: Optional
            The new value
        """
        per_key_store = self.context_dict[key]
        per_key_store[matrix.handle.value] = value

dmatrix_context = XGBDMatrixContext()
+
+
class XGBModel(PythonBasedModel):
    """Train a XGBoost model to predict the normalized throughputs of programs.

    Let the normalized throughput be the score of a program (higher is better). We predict
    (approximate) the score of a program = the sum of the scores of all stages in this program.
    i.e. score(P) = score_s0 + score_s1 + ... + score_sn,
    where score_si is the score of Stage i in Program P.

    We extract a feature vector for each stage and let xgboost predict the score
    for each stage. We then sum up the predictions as the score of the whole program.

    We use RMSE as the loss function.  i.e. loss(P, y) = 1/2 * (score(P) - y)^2,
    where P is the program and y is the normalized throughput according to
    the ground truth (measurement).
    XGBoost does not support this loss function because `score(P)` is a sum of the prediction
    of several samples, so we implemented a custom loss function and call it pack-sum-rmse.
    It is called "pack-sum" because we combine several samples into a "pack" and sum up
    their predictions.
    """
    def __init__(self, verbose_eval=25, num_warmup_sample=100, seed=None):
        """
        Parameters
        ----------
        verbose_eval: int
            Print a training log every `verbose_eval` boosting rounds.
        num_warmup_sample: int
            Until this many samples have been collected, `predict` returns
            random scores instead of model predictions.
        seed: Optional[int]
            The random seed for xgboost. Defaults to 43 when not given.
        """
        self.xgb_params = {
            'max_depth': 10,
            'gamma': 0.001,
            'min_child_weight': 0,
            'eta': 0.2,
            # todo(merrymercy): automatically decrease learning rate when the loss is too large

            'n_gpus': 0,
            'nthread': multiprocessing.cpu_count() // 2,
            'verbosity': 0,
            'seed': seed or 43,
            'disable_default_eval_metric': 1
        }
        self.bst = None
        self.plan_size = 32
        self.num_warmup_sample = num_warmup_sample
        self.verbose_eval = verbose_eval

        super().__init__()

        # cache measurement input/result pairs and extracted features
        self.inputs = []
        self.results = []
        self.inputs_feature_cache = []

    def update(self, inputs, results):
        """Update the cost model according to new measurement results (training data).

        XGBoost does not support incremental training, so a new model is trained
        from scratch with all cached data every time this function is invoked.

        Note
        ----
        This function assumes the incoming pairs contain no duplicated inputs,
        because the search policy guarantees that the same schedule is measured
        only once.

        Parameters
        ----------
        inputs : List[MeasureInput]
            The measurement inputs
        results : List[MeasureResult]
            The measurement results
        """
        if len(inputs) <= 0:
            return

        self.inputs.extend(inputs)
        self.results.extend(results)

        # extract feature
        # Only the newly-appended pairs need feature extraction; the features of
        # the first `n_cached` pairs are reused from the cache.
        n_cached = len(self.inputs_feature_cache)
        features, normalized_throughputs, task_ids = \
            get_per_store_features_from_measure_pairs(self.inputs, self.results,
                                                      skip_first_n_feature_extraction=n_cached)
        if n_cached > 0:
            features = list(features)
            features[:n_cached] = self.inputs_feature_cache
            features = np.array(features, dtype=object)
        self.inputs_feature_cache = features
        # The throughputs are passed both as labels and as sample weights.
        dtrain = pack_sum_xgbmatrix(features, normalized_throughputs,
                                    task_ids, normalized_throughputs)

        # train xgb model
        self.bst = xgb.train(self.xgb_params, dtrain,
                             num_boost_round=10000,
                             obj=pack_sum_square_error,
                             callbacks=[custom_callback(
                                 stopping_rounds=50,
                                 metric='tr-p-rmse',
                                 fevals=[
                                     pack_sum_rmse, pack_sum_average_peak_score(self.plan_size),
                                 ],
                                 evals=[(dtrain, 'tr')],
                                 maximize=False,
                                 verbose_eval=self.verbose_eval)])

    def predict(self, task, states):
        """Predict the scores of states

        Parameters
        ----------
        task : SearchTask
            The search task of states
        states : List[State]
            The input states

        Returns
        -------
        scores: List[float]
            The predicted scores for all states
        """
        features = get_per_store_features_from_states(states, task)
        # Use the trained model only after enough samples have been measured;
        # before that, return random scores to encourage exploration.
        if self.bst is not None and len(self.inputs) > self.num_warmup_sample:
            dtest, pack_ids = feature_to_pack_sum_xgbmatrix(features)
            raw_preds = self.bst.predict(dtest)
            ret = predict_throughput_pack_sum(raw_preds, pack_ids)
        else:
            ret = np.random.uniform(0, 1, (len(states),))

        # Mark invalid states that failed to be lowered (all-zero feature
        # vectors) with -inf so they are never selected.
        for idx, feature in enumerate(features):
            if feature.min() == feature.max() == 0:
                ret[idx] = float('-inf')

        return ret

    def predict_stages(self, task, states):
        """Predict the scores of all stages in states. This is the breakdown version of `predict`.

        Parameters
        ----------
        task : SearchTask
            The search task of states
        states : List[State]
            The input states

        Returns
        -------
        scores: List[float]
            The predicted scores for all stages in all states in the packed format

        Note
        ----
        For faster data copy between c++ and python, the python part returns scores in a
        single flatten array using a packed format. The c++ part then unpacks the flatten array.

        The packed format is:
        {
          float  scores[N];                 // scores[i] is the score for states[i].
          int    n_stage_0;                 // the number of stages in states[0]
          float  stage_scores_0[n_stage_0]; // the scores for all stages in states[0]
          int    n_stage_1;                 // the number of stages in states[1]
          float  stage_scores_1[n_stage_1]; // the scores for all stages in states[1]
          ...
          int    n_stage_i;                 // the number of stages in states[i]
          float  stage_scores_i[n_stage_i]; // the scores for all stages in states[i]
          ...  // until i == N - 1
        }
        To implement this format, we also store int as float, so we can store all numbers
        into a single float array.
        """
        features = get_per_store_features_from_states(states, task)
        if self.bst is not None and len(self.inputs) > self.num_warmup_sample:
            dtest, pack_ids = feature_to_pack_sum_xgbmatrix(features)
            raw_preds = self.bst.predict(dtest)
            # per-program scores first, then per-stage breakdowns are appended
            breakdown = predict_throughput_pack_sum(raw_preds, pack_ids)
            stage_scores = [[] for _ in range(len(states))]
            for pred, pack_id in zip(raw_preds, pack_ids):
                stage_scores[pack_id].append(pred)
            for idx, stage_score in enumerate(stage_scores):
                # store the stage count as a float, followed by the stage scores
                breakdown = np.append(breakdown, len(stage_score))
                breakdown = np.concatenate((breakdown, np.array(stage_score)))
        else:
            breakdown = np.concatenate(
                (np.random.uniform(0, 1, (len(states), )), np.zeros(len(states), )))

        # Mark invalid states that failed to be lowered (all-zero feature
        # vectors) with -inf so they are never selected.
        for idx, feature in enumerate(features):
            if feature.min() == feature.max() == 0:
                breakdown[idx] = float('-inf')

        return breakdown

    def load_log_file(self, file_name, n_lines=None):
        """Load measure records from a log file to pre-train the cost model

        Parameters
        ----------
        file_name: str
            The filename
        n_lines: Optional[int]
            Only load the first n lines of the log file.
            Presumably all lines are loaded when this is None — confirm
            against `RecordReader.read_lines`.
        """
        inputs, results = RecordReader(file_name).read_lines(n_lines)
        logger.info("XGBModel: Loaded %s measurement records from %s", len(inputs), file_name)
        self.update(inputs, results)

    def save(self, file_name: str):
        """Save the model to a file

        Parameters
        ----------
        file_name: str
            The filename
        """
        self.bst.save_model(file_name)

    def load(self, file_name: str):
        """Load the model from a file

        Parameters
        ----------
        file_name: str
            The filename
        """
        if self.bst is None:
            self.bst = xgb.Booster(self.xgb_params)
        self.bst.load_model(file_name)
        # -1 disables the random warmup predictions in `predict`, so the
        # loaded model is used immediately.
        self.num_warmup_sample = -1
+
+
def feature_to_pack_sum_xgbmatrix(xs):
    """Convert an extracted multi-stage feature vector to a xgbmatrx in pack-sum format

    Parameters
    ----------
    xs: np.ndarray
        The feature vector

    Returns
    -------
    dmatrix: xgb.DMatrix
        The DMatrix
    pack_ids: List[int]
        pack ids information
    """
    # Flatten the per-stage feature rows of all programs into a single matrix,
    # remembering which program (pack) each row came from.
    flat_rows = []
    pack_ids = []

    for pack_id, stage_features in enumerate(xs):
        flat_rows.extend(stage_features)
        pack_ids.extend([pack_id] * len(stage_features))

    return xgb.DMatrix(np.array(flat_rows)), pack_ids
+
+
def pack_sum_xgbmatrix(xs, ys, gids=None, weights=None):
    """Convert (feature, label) pairs into a xgb matrix with pack-sum format

    Parameters
    ----------
    xs: np.ndarray
        The feature vector
    ys: np.ndarray
        The normalized throughput
    gids: Optional[List[int]]
        Group id (task id)
    weights: Optional[np.ndarray]
        The weight of samples

    Returns
    -------
    dmatrix: xgb.DMatrix
        The DMatrix with pack-sum information.
        The pack ids and group sizes are stored as side-channel attributes
        in the global `dmatrix_context`, keyed by this DMatrix.
    """
    if gids is not None:
        # sort by group
        indices = gids.argsort()
        xs, ys = xs[indices], ys[indices]
        group_sizes = np.bincount(gids)
        if weights is not None:
            weights = weights[indices]
    else:
        # assume it has only one group
        group_sizes = [len(xs)]

    # Flatten per-stage feature rows; each row inherits the label (and weight)
    # of the program it belongs to, and pack_ids records that program index.
    x_flatten = []
    y_flatten = []
    weights_flatten = []
    pack_ids = []

    if weights is not None:
        for ct, (x, y, w) in enumerate(zip(xs, ys, weights)):
            for row in x:
                x_flatten.append(row)
                y_flatten.append(y)
                weights_flatten.append(w)
                pack_ids.append(ct)
    else:
        for ct, (x, y) in enumerate(zip(xs, ys)):
            for row in x:
                x_flatten.append(row)
                y_flatten.append(y)
                pack_ids.append(ct)

    ret = xgb.DMatrix(np.array(x_flatten), y_flatten)
    if weights is not None:
        ret.set_weight(weights_flatten)
    dmatrix_context.set('pack_ids', ret, np.array(pack_ids))
    dmatrix_context.set('group_sizes', ret, group_sizes)
    return ret
+
+
def predict_throughput_pack_sum(raw_preds, pack_ids):
    """Predict the throughputs for predictions in pack-sum format

    Sum the per-stage raw predictions that belong to the same pack (program)
    to obtain one throughput score per program.

    Parameters
    ----------
    raw_preds: np.ndarray
        The raw predictions
    pack_ids: List[int]
        The pack id for predictions

    Returns
    -------
    throughputs: np.ndarray
        The throughput
    """
    return np.bincount(pack_ids, weights=raw_preds)
+
+def pack_sum_square_error(preds, dtrain):
+    """Implement square error loss on pack-sum format as
+     a custom objective function for xgboost.
+
+    Parameters
+    ----------
+    preds: np.ndarray
+        The predicitons
+    dtrain: xgb.DMatrix
+        The training set
+
+    Returns
+    -------
+    gradient and hessian

Review comment:
       Type missing.

##########
File path: python/tvm/auto_scheduler/cost_model/xgb_model.py
##########
@@ -0,0 +1,590 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+# pylint: disable=invalid-name
+
+"""Cost model based on xgboost"""
+import multiprocessing
+import logging
+from collections import defaultdict
+
+import numpy as np
+import xgboost as xgb
+from xgboost.core import EarlyStopException
+from xgboost.callback import _fmt_metric
+from xgboost.training import aggcv
+
+from tvm.autotvm.tuner.metric import max_curve
+from .cost_model import PythonBasedModel
+from ..feature import get_per_store_features_from_measure_pairs, 
get_per_store_features_from_states
+from ..measure_record import RecordReader
+
+logger = logging.getLogger('auto_scheduler')
+
+class XGBDMatrixContext:
+    """A global context to hold additional attributes of xgb.DMatrix"""
+    def __init__(self):
+        self.context_dict = defaultdict(dict)
+
+    def get(self, key, matrix, default=None):
+        """
+        Get an attribute of a xgb.DMatrix
+
+        Parameters
+        ----------
+        key: str
+            The name of the attribute
+        matrix: xgb.DMatrix
+            The matrix
+        default: Optional
+            The default value if the item does not exist
+        """
+        return self.context_dict[key].get(matrix.handle.value, default)
+
+    def set(self, key, matrix, value):
+        """
+        Set an attribute for a xgb.DMatrix
+
+        Parameters
+        ----------
+        key: str
+            The name of the attribute
+        matrix: xgb.DMatrix
+            The matrix
+        value: Optional
+            The new value
+        """
+        self.context_dict[key][matrix.handle.value] = value
+
+dmatrix_context = XGBDMatrixContext()
+
+
+class XGBModel(PythonBasedModel):
+    """Train a XGBoost model to predict the normalized throughputs of programs.
+
+    Let the normalized throughput be the score of a program (higher is 
better). We predict
+    (approximiate) the score of a program = the sum of the scores of all 
stages in this program.
+    i.e. score(P) = score_s0 + score_s1 + ... + score_sn,
+    where score_si is the score of Stage i in Program P.
+
+    We extract feature for each stage and let the xgboost predict the score 
for each stage.
+    We then sum up the predictions as the score of the whole program.
+
+    We use RMSE as the loss function.  i.e. loss(P, y) = 1/2 * (score(P) - 
y)^2,
+    where P is the program and y is the normalized throughput according to
+    the ground truth (measurement).
+    XGBoost does not support this loss function because `score(P)` is a sum of 
the prediction
+    of several samples, so we implemented a custom loss function and call it 
pack-sum-rmse.
+    It is called "pack-sum" because we combine several samples into a "pack" 
and sum up
+    their predictions.
+    """
+    def __init__(self, verbose_eval=25, num_warmup_sample=100, seed=None):
+        self.xgb_params = {
+            'max_depth': 10,
+            'gamma': 0.001,
+            'min_child_weight': 0,
+            'eta': 0.2,
+            # todo(merrymercy): automatically decrease learning rate when the 
loss is too large
+
+            'n_gpus': 0,
+            'nthread': multiprocessing.cpu_count() // 2,
+            'verbosity': 0,
+            'seed': seed or 43,
+            'disable_default_eval_metric': 1
+        }
+        self.bst = None
+        self.plan_size = 32
+        self.num_warmup_sample = num_warmup_sample
+        self.verbose_eval = verbose_eval
+
+        super().__init__()
+
+        # cache measurement input/result pairs and extracted features
+        self.inputs = []
+        self.results = []
+        self.inputs_feature_cache = []
+
+    def update(self, inputs, results):
+        """Update the cost model according to new measurement results 
(training data).
+
+        Parameters
+        ----------
+        inputs : List[MeasureInput]
+            The measurement inputs
+        results : List[MeasureResult]
+            The measurement results
+        """
+        if len(inputs) <= 0:
+            return
+
+        self.inputs.extend(inputs)
+        self.results.extend(results)
+
+        # extract feature
+        n_cached = len(self.inputs_feature_cache)
+        features, normalized_throughputs, task_ids = \
+            get_per_store_features_from_measure_pairs(self.inputs, 
self.results,
+                                                      
skip_first_n_feature_extraction=n_cached)
+        if n_cached > 0:
+            features = list(features)
+            features[:n_cached] = self.inputs_feature_cache
+            features = np.array(features, dtype=object)
+        self.inputs_feature_cache = features
+        dtrain = pack_sum_xgbmatrix(features, normalized_throughputs,
+                                    task_ids, normalized_throughputs)
+
+        # train xgb model
+        self.bst = xgb.train(self.xgb_params, dtrain,
+                             num_boost_round=10000,
+                             obj=pack_sum_square_error,
+                             callbacks=[custom_callback(
+                                 stopping_rounds=50,
+                                 metric='tr-p-rmse',
+                                 fevals=[
+                                     pack_sum_rmse, 
pack_sum_average_peak_score(self.plan_size),
+                                 ],
+                                 evals=[(dtrain, 'tr')],
+                                 maximize=False,
+                                 verbose_eval=self.verbose_eval)])
+
+    def predict(self, task, states):
+        """Predict the scores of states
+
+        Parameters
+        ----------
+        search_task : SearchTask
+            The search task of states
+        statse : List[State]
+            The input states
+
+        Returns
+        -------
+        scores: List[float]
+            The predicted scores for all states
+        """
+        features = get_per_store_features_from_states(states, task)
+        if self.bst is not None and len(self.inputs) > self.num_warmup_sample:
+            dtest, pack_ids = feature_to_pack_sum_xgbmatrix(features)
+            raw_preds = self.bst.predict(dtest)
+            ret = predict_throughput_pack_sum(raw_preds, pack_ids)
+        else:
+            ret = np.random.uniform(0, 1, (len(states),))
+
+        # Predict 0 for invalid states that failed to be lowered.
+        for idx, feature in enumerate(features):
+            if feature.min() == feature.max() == 0:
+                ret[idx] = float('-inf')
+
+        return ret
+
+    def predict_stages(self, task, states):
+        """Predict the scores of all stages in states. This is the breakdown 
version of `predict`.
+
+        Parameters
+        ----------
+        search_task : SearchTask
+            The search task of states
+        statse : List[State]
+            The input states
+
+        Returns
+        -------
+        scores: List[float]
+            The predicted scores for all stages in all states in the packed 
format
+
+        Note
+        ----
+        For faster data copy between c++ and python, the python part returns 
scores in a
+        single flatten array using a packed format. The c++ part then unpacks 
the flatten array.
+
+        The packed format is:
+        {
+          float  scores[N];                 // scores[i] is the score for 
states[i].
+          int    n_stage_0;                 // the number of stages in 
states[0]
+          float  stage_scores_0[[n_stage_0] // the scores for all stages in 
states[0]
+          int    n_stage_1;                 // the number of stages in 
states[1]
+          float  stage_scores_1[n_stage_1]; // the scores for all stages in 
states[1]
+          ...
+          int    n_stage_i;                 // the number of stages in 
states[i]
+          float  stage_scores_1[n_stage_i]; // the scores for all stages in 
states[i]
+          ...  // untill i == N - 1
+        }
+        To implement this format, we also store int as float, so we can store 
all numbers
+        into a single float array.
+        """
+        features = get_per_store_features_from_states(states, task)
+        if self.bst is not None and len(self.inputs) > self.num_warmup_sample:
+            dtest, pack_ids = feature_to_pack_sum_xgbmatrix(features)
+            raw_preds = self.bst.predict(dtest)
+            breakdown = predict_throughput_pack_sum(raw_preds, pack_ids)
+            stage_scores = [[] for _ in range(len(states))]
+            for pred, pack_id in zip(raw_preds, pack_ids):
+                stage_scores[pack_id].append(pred)
+            for idx, stage_score in enumerate(stage_scores):
+                breakdown = np.append(breakdown, len(stage_score))
+                breakdown = np.concatenate((breakdown, np.array(stage_score)))
+        else:
+            breakdown = np.concatenate(
+                (np.random.uniform(0, 1, (len(states), )), 
np.zeros(len(states), )))
+
+        # Predict 0 for invalid states that failed to be lowered.
+        for idx, feature in enumerate(features):
+            if feature.min() == feature.max() == 0:
+                breakdown[idx] = float('-inf')
+
+        return breakdown
+
+    def load_log_file(self, file_name, n_lines=None):

Review comment:
       Given its functionality, it might be better to name it `update_from_file`
or something similar.

##########
File path: python/tvm/auto_scheduler/cost_model/xgb_model.py
##########
@@ -0,0 +1,590 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+# pylint: disable=invalid-name
+
+"""Cost model based on xgboost"""
+import multiprocessing
+import logging
+from collections import defaultdict
+
+import numpy as np
+import xgboost as xgb
+from xgboost.core import EarlyStopException
+from xgboost.callback import _fmt_metric
+from xgboost.training import aggcv
+
+from tvm.autotvm.tuner.metric import max_curve
+from .cost_model import PythonBasedModel
+from ..feature import get_per_store_features_from_measure_pairs, 
get_per_store_features_from_states
+from ..measure_record import RecordReader
+
+logger = logging.getLogger('auto_scheduler')
+
+class XGBDMatrixContext:
+    """A global context to hold additional attributes of xgb.DMatrix"""
+    def __init__(self):
+        self.context_dict = defaultdict(dict)
+
+    def get(self, key, matrix, default=None):
+        """
+        Get an attribute of a xgb.DMatrix
+
+        Parameters
+        ----------
+        key: str
+            The name of the attribute
+        matrix: xgb.DMatrix
+            The matrix
+        default: Optional
+            The default value if the item does not exist
+        """
+        return self.context_dict[key].get(matrix.handle.value, default)
+
+    def set(self, key, matrix, value):
+        """
+        Set an attribute for a xgb.DMatrix
+
+        Parameters
+        ----------
+        key: str
+            The name of the attribute
+        matrix: xgb.DMatrix
+            The matrix
+        value: Optional
+            The new value
+        """
+        self.context_dict[key][matrix.handle.value] = value
+
+dmatrix_context = XGBDMatrixContext()
+
+
+class XGBModel(PythonBasedModel):
+    """Train a XGBoost model to predict the normalized throughputs of programs.
+
+    Let the normalized throughput be the score of a program (higher is 
better). We predict
+    (approximiate) the score of a program = the sum of the scores of all 
stages in this program.
+    i.e. score(P) = score_s0 + score_s1 + ... + score_sn,
+    where score_si is the score of Stage i in Program P.
+
+    We extract feature for each stage and let the xgboost predict the score 
for each stage.
+    We then sum up the predictions as the score of the whole program.
+
+    We use RMSE as the loss function.  i.e. loss(P, y) = 1/2 * (score(P) - 
y)^2,
+    where P is the program and y is the normalized throughput according to
+    the ground truth (measurement).
+    XGBoost does not support this loss function because `score(P)` is a sum of 
the prediction
+    of several samples, so we implemented a custom loss function and call it 
pack-sum-rmse.
+    It is called "pack-sum" because we combine several samples into a "pack" 
and sum up
+    their predictions.
+    """
+    def __init__(self, verbose_eval=25, num_warmup_sample=100, seed=None):
+        self.xgb_params = {
+            'max_depth': 10,
+            'gamma': 0.001,
+            'min_child_weight': 0,
+            'eta': 0.2,
+            # todo(merrymercy): automatically decrease learning rate when the 
loss is too large
+
+            'n_gpus': 0,
+            'nthread': multiprocessing.cpu_count() // 2,
+            'verbosity': 0,
+            'seed': seed or 43,
+            'disable_default_eval_metric': 1
+        }
+        self.bst = None
+        self.plan_size = 32
+        self.num_warmup_sample = num_warmup_sample
+        self.verbose_eval = verbose_eval
+
+        super().__init__()
+
+        # cache measurement input/result pairs and extracted features
+        self.inputs = []
+        self.results = []
+        self.inputs_feature_cache = []
+
+    def update(self, inputs, results):
+        """Update the cost model according to new measurement results 
(training data).
+
+        Parameters
+        ----------
+        inputs : List[MeasureInput]
+            The measurement inputs
+        results : List[MeasureResult]
+            The measurement results
+        """
+        if len(inputs) <= 0:
+            return
+
+        self.inputs.extend(inputs)
+        self.results.extend(results)
+
+        # extract feature
+        n_cached = len(self.inputs_feature_cache)
+        features, normalized_throughputs, task_ids = \
+            get_per_store_features_from_measure_pairs(self.inputs, 
self.results,
+                                                      
skip_first_n_feature_extraction=n_cached)
+        if n_cached > 0:
+            features = list(features)
+            features[:n_cached] = self.inputs_feature_cache
+            features = np.array(features, dtype=object)
+        self.inputs_feature_cache = features
+        dtrain = pack_sum_xgbmatrix(features, normalized_throughputs,
+                                    task_ids, normalized_throughputs)
+
+        # train xgb model
+        self.bst = xgb.train(self.xgb_params, dtrain,
+                             num_boost_round=10000,
+                             obj=pack_sum_square_error,
+                             callbacks=[custom_callback(
+                                 stopping_rounds=50,
+                                 metric='tr-p-rmse',
+                                 fevals=[
+                                     pack_sum_rmse, 
pack_sum_average_peak_score(self.plan_size),
+                                 ],
+                                 evals=[(dtrain, 'tr')],
+                                 maximize=False,
+                                 verbose_eval=self.verbose_eval)])
+
+    def predict(self, task, states):
+        """Predict the scores of states
+
+        Parameters
+        ----------
+        search_task : SearchTask
+            The search task of states
+        statse : List[State]
+            The input states
+
+        Returns
+        -------
+        scores: List[float]
+            The predicted scores for all states
+        """
+        features = get_per_store_features_from_states(states, task)
+        if self.bst is not None and len(self.inputs) > self.num_warmup_sample:
+            dtest, pack_ids = feature_to_pack_sum_xgbmatrix(features)
+            raw_preds = self.bst.predict(dtest)
+            ret = predict_throughput_pack_sum(raw_preds, pack_ids)
+        else:
+            ret = np.random.uniform(0, 1, (len(states),))
+
+        # Predict 0 for invalid states that failed to be lowered.
+        for idx, feature in enumerate(features):
+            if feature.min() == feature.max() == 0:
+                ret[idx] = float('-inf')
+
+        return ret
+
+    def predict_stages(self, task, states):
+        """Predict the scores of all stages in states. This is the breakdown 
version of `predict`.
+
+        Parameters
+        ----------
+        search_task : SearchTask
+            The search task of states
+        statse : List[State]
+            The input states
+
+        Returns
+        -------
+        scores: List[float]
+            The predicted scores for all stages in all states in the packed 
format
+
+        Note
+        ----
+        For faster data copy between c++ and python, the python part returns 
scores in a
+        single flatten array using a packed format. The c++ part then unpacks 
the flatten array.
+
+        The packed format is:
+        {
+          float  scores[N];                 // scores[i] is the score for 
states[i].
+          int    n_stage_0;                 // the number of stages in 
states[0]
+          float  stage_scores_0[n_stage_0]; // the scores for all stages in 
states[0]
+          int    n_stage_1;                 // the number of stages in 
states[1]
+          float  stage_scores_1[n_stage_1]; // the scores for all stages in 
states[1]
+          ...
+          int    n_stage_i;                 // the number of stages in 
states[i]
+          float  stage_scores_i[n_stage_i]; // the scores for all stages in 
states[i]
+          ...  // until i == N - 1
+        }
+        To implement this format, we also store int as float, so we can store 
all numbers
+        into a single float array.
+        """
+        features = get_per_store_features_from_states(states, task)
+        if self.bst is not None and len(self.inputs) > self.num_warmup_sample:
+            dtest, pack_ids = feature_to_pack_sum_xgbmatrix(features)
+            raw_preds = self.bst.predict(dtest)
+            breakdown = predict_throughput_pack_sum(raw_preds, pack_ids)
+            stage_scores = [[] for _ in range(len(states))]
+            for pred, pack_id in zip(raw_preds, pack_ids):
+                stage_scores[pack_id].append(pred)
+            for idx, stage_score in enumerate(stage_scores):
+                breakdown = np.append(breakdown, len(stage_score))
+                breakdown = np.concatenate((breakdown, np.array(stage_score)))
+        else:
+            breakdown = np.concatenate(
+                (np.random.uniform(0, 1, (len(states), )), 
np.zeros(len(states), )))
+
+        # Predict 0 for invalid states that failed to be lowered.
+        for idx, feature in enumerate(features):
+            if feature.min() == feature.max() == 0:
+                breakdown[idx] = float('-inf')
+
+        return breakdown
+
+    def load_log_file(self, file_name, n_lines=None):
+        """Load measure records from a log file to pre-train the cost model
+
+        Parameters
+        ----------
+        file_name: str
+            The filename
+        n_lines: Optional[int]
+            Only load the first `n_lines` lines of this log file
+        """
+        inputs, results = RecordReader(file_name).read_lines(n_lines)
+        logger.info("XGBModel: Loaded %s measurement records from %s", 
len(inputs), file_name)
+        self.update(inputs, results)
+
+    def save(self, file_name: str):
+        """Save the model to a file
+
+        Parameters
+        ----------
+        file_name: str
+            The filename
+        """
+        self.bst.save_model(file_name)
+
+    def load(self, file_name: str):
+        """Load the model from a file
+
+        Parameters
+        ----------
+        file_name: str
+            The filename
+        """
+        if self.bst is None:
+            self.bst = xgb.Booster(self.xgb_params)
+        self.bst.load_model(file_name)
+        self.num_warmup_sample = -1
+
+
+def feature_to_pack_sum_xgbmatrix(xs):
+    """Convert an extracted multi-stage feature vector to a xgbmatrix in 
pack-sum format
+
+    Parameters
+    ----------
+    xs: np.ndarray
+        The feature vector
+
+    Returns
+    -------
+    dmatrix: xgb.DMatrix
+        The DMatrix
+    pack_ids: List[int]
+        pack ids information
+    """
+    x_flatten = []
+    pack_ids = []
+
+    for ct, x in enumerate(xs):
+        for row in x:
+            x_flatten.append(row)
+            pack_ids.append(ct)
+
+    return xgb.DMatrix(np.array(x_flatten)), pack_ids
+
+
+def pack_sum_xgbmatrix(xs, ys, gids=None, weights=None):
+    """Convert (feature, label) pairs into a xgb matrix with pack-sum format
+
+    Parameters
+    ----------
+    xs: np.ndarray
+        The feature vector
+    ys: np.ndarray
+        The normalized throughput
+    gids: Optional[List[int]]
+        Group id (task id)
+    weights: Optional[np.ndarray]
+        The weight of samples
+
+    Returns
+    -------
+    dmatrix: xgb.DMatrix
+        The DMatrix with pack-sum information
+    """
+    if gids is not None:
+        # sort by group
+        indices = gids.argsort()
+        xs, ys = xs[indices], ys[indices]
+        group_sizes = np.bincount(gids)
+        if weights is not None:
+            weights = weights[indices]
+    else:
+        # assume it has only one group
+        group_sizes = [len(xs)]
+
+    x_flatten = []
+    y_flatten = []
+    weights_flatten = []
+    pack_ids = []
+
+    if weights is not None:
+        for ct, (x, y, w) in enumerate(zip(xs, ys, weights)):
+            for row in x:
+                x_flatten.append(row)
+                y_flatten.append(y)
+                weights_flatten.append(w)
+                pack_ids.append(ct)
+    else:
+        for ct, (x, y) in enumerate(zip(xs, ys)):
+            for row in x:
+                x_flatten.append(row)
+                y_flatten.append(y)
+                pack_ids.append(ct)
+
+    ret = xgb.DMatrix(np.array(x_flatten), y_flatten)
+    if weights is not None:
+        ret.set_weight(weights_flatten)
+    dmatrix_context.set('pack_ids', ret, np.array(pack_ids))
+    dmatrix_context.set('group_sizes', ret, group_sizes)
+    return ret
+
+
+def predict_throughput_pack_sum(raw_preds, pack_ids):
+    """Predict the throughputs for predictions in pack-sum format
+
+    Parameters
+    ----------
+    raw_preds: np.ndarray
+        The raw predictions
+    pack_ids: List[int]
+        The pack id for predictions
+
+    Returns
+    -------
+    throughputs: np.ndarray
+        The throughput
+    """
+    sum_pred = np.bincount(pack_ids, weights=raw_preds)
+    return sum_pred
+
+def pack_sum_square_error(preds, dtrain):
+    """Implement square error loss on pack-sum format as
+     a custom objective function for xgboost.
+
+    Parameters
+    ----------
+    preds: np.ndarray
+        The predictions
+    dtrain: xgb.DMatrix
+        The training set
+
+    Returns
+    -------
+    gradient and hessian
+    """
+    pack_ids = dmatrix_context.get("pack_ids", dtrain)
+    weight = dtrain.get_weight()
+
+    sum_pred = np.bincount(pack_ids, weights=preds)
+    x = sum_pred[pack_ids]
+    y = dtrain.get_label()
+    gradient = x - y
+    hessian = np.ones_like(gradient)
+
+    if len(weight) == 0:

Review comment:
       ```suggestion
       if not weight:
   ```

##########
File path: python/tvm/auto_scheduler/cost_model/xgb_model.py
##########
@@ -0,0 +1,590 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+# pylint: disable=invalid-name
+
+"""Cost model based on xgboost"""
+import multiprocessing
+import logging
+from collections import defaultdict
+
+import numpy as np
+import xgboost as xgb
+from xgboost.core import EarlyStopException
+from xgboost.callback import _fmt_metric
+from xgboost.training import aggcv
+
+from tvm.autotvm.tuner.metric import max_curve
+from .cost_model import PythonBasedModel
+from ..feature import get_per_store_features_from_measure_pairs, 
get_per_store_features_from_states
+from ..measure_record import RecordReader
+
+logger = logging.getLogger('auto_scheduler')
+
+class XGBDMatrixContext:
+    """A global context to hold additional attributes of xgb.DMatrix"""
+    def __init__(self):
+        self.context_dict = defaultdict(dict)
+
+    def get(self, key, matrix, default=None):
+        """
+        Get an attribute of a xgb.DMatrix
+
+        Parameters
+        ----------
+        key: str
+            The name of the attribute
+        matrix: xgb.DMatrix
+            The matrix
+        default: Optional
+            The default value if the item does not exist
+        """
+        return self.context_dict[key].get(matrix.handle.value, default)
+
+    def set(self, key, matrix, value):
+        """
+        Set an attribute for a xgb.DMatrix
+
+        Parameters
+        ----------
+        key: str
+            The name of the attribute
+        matrix: xgb.DMatrix
+            The matrix
+        value: Optional
+            The new value
+        """
+        self.context_dict[key][matrix.handle.value] = value
+
+dmatrix_context = XGBDMatrixContext()
+
+
+class XGBModel(PythonBasedModel):
+    """Train a XGBoost model to predict the normalized throughputs of programs.
+
+    Let the normalized throughput be the score of a program (higher is 
better). We predict
+    (approximate) the score of a program = the sum of the scores of all 
stages in this program.
+    i.e. score(P) = score_s0 + score_s1 + ... + score_sn,
+    where score_si is the score of Stage i in Program P.
+
+    We extract feature for each stage and let the xgboost predict the score 
for each stage.
+    We then sum up the predictions as the score of the whole program.
+
+    We use RMSE as the loss function.  i.e. loss(P, y) = 1/2 * (score(P) - 
y)^2,
+    where P is the program and y is the normalized throughput according to
+    the ground truth (measurement).
+    XGBoost does not support this loss function because `score(P)` is a sum of 
the prediction
+    of several samples, so we implemented a custom loss function and call it 
pack-sum-rmse.
+    It is called "pack-sum" because we combine several samples into a "pack" 
and sum up
+    their predictions.
+    """
+    def __init__(self, verbose_eval=25, num_warmup_sample=100, seed=None):
+        self.xgb_params = {
+            'max_depth': 10,
+            'gamma': 0.001,
+            'min_child_weight': 0,
+            'eta': 0.2,
+            # todo(merrymercy): automatically decrease learning rate when the 
loss is too large
+
+            'n_gpus': 0,
+            'nthread': multiprocessing.cpu_count() // 2,
+            'verbosity': 0,
+            'seed': seed or 43,
+            'disable_default_eval_metric': 1
+        }
+        self.bst = None
+        self.plan_size = 32
+        self.num_warmup_sample = num_warmup_sample
+        self.verbose_eval = verbose_eval
+
+        super().__init__()
+
+        # cache measurement input/result pairs and extracted features
+        self.inputs = []
+        self.results = []
+        self.inputs_feature_cache = []
+
+    def update(self, inputs, results):
+        """Update the cost model according to new measurement results 
(training data).
+
+        Parameters
+        ----------
+        inputs : List[MeasureInput]
+            The measurement inputs
+        results : List[MeasureResult]
+            The measurement results
+        """
+        if len(inputs) <= 0:
+            return
+
+        self.inputs.extend(inputs)
+        self.results.extend(results)
+
+        # extract feature
+        n_cached = len(self.inputs_feature_cache)
+        features, normalized_throughputs, task_ids = \
+            get_per_store_features_from_measure_pairs(self.inputs, 
self.results,
+                                                      
skip_first_n_feature_extraction=n_cached)
+        if n_cached > 0:
+            features = list(features)
+            features[:n_cached] = self.inputs_feature_cache
+            features = np.array(features, dtype=object)
+        self.inputs_feature_cache = features
+        dtrain = pack_sum_xgbmatrix(features, normalized_throughputs,
+                                    task_ids, normalized_throughputs)
+
+        # train xgb model
+        self.bst = xgb.train(self.xgb_params, dtrain,
+                             num_boost_round=10000,
+                             obj=pack_sum_square_error,
+                             callbacks=[custom_callback(
+                                 stopping_rounds=50,
+                                 metric='tr-p-rmse',
+                                 fevals=[
+                                     pack_sum_rmse, 
pack_sum_average_peak_score(self.plan_size),
+                                 ],
+                                 evals=[(dtrain, 'tr')],
+                                 maximize=False,
+                                 verbose_eval=self.verbose_eval)])
+
+    def predict(self, task, states):
+        """Predict the scores of states
+
+        Parameters
+        ----------
+        search_task : SearchTask
+            The search task of states
+        states : List[State]
+            The input states
+
+        Returns
+        -------
+        scores: List[float]
+            The predicted scores for all states
+        """
+        features = get_per_store_features_from_states(states, task)
+        if self.bst is not None and len(self.inputs) > self.num_warmup_sample:
+            dtest, pack_ids = feature_to_pack_sum_xgbmatrix(features)
+            raw_preds = self.bst.predict(dtest)
+            ret = predict_throughput_pack_sum(raw_preds, pack_ids)
+        else:
+            ret = np.random.uniform(0, 1, (len(states),))
+
+        # Predict 0 for invalid states that failed to be lowered.
+        for idx, feature in enumerate(features):
+            if feature.min() == feature.max() == 0:
+                ret[idx] = float('-inf')
+
+        return ret
+
+    def predict_stages(self, task, states):
+        """Predict the scores of all stages in states. This is the breakdown 
version of `predict`.
+
+        Parameters
+        ----------
+        search_task : SearchTask
+            The search task of states
+        states : List[State]
+            The input states
+
+        Returns
+        -------
+        scores: List[float]
+            The predicted scores for all stages in all states in the packed 
format
+
+        Note
+        ----
+        For faster data copy between c++ and python, the python part returns 
scores in a
+        single flatten array using a packed format. The c++ part then unpacks 
the flatten array.
+
+        The packed format is:
+        {
+          float  scores[N];                 // scores[i] is the score for 
states[i].
+          int    n_stage_0;                 // the number of stages in 
states[0]
+          float  stage_scores_0[n_stage_0]; // the scores for all stages in 
states[0]
+          int    n_stage_1;                 // the number of stages in 
states[1]
+          float  stage_scores_1[n_stage_1]; // the scores for all stages in 
states[1]
+          ...
+          int    n_stage_i;                 // the number of stages in 
states[i]
+          float  stage_scores_i[n_stage_i]; // the scores for all stages in 
states[i]
+          ...  // until i == N - 1
+        }
+        To implement this format, we also store int as float, so we can store 
all numbers
+        into a single float array.
+        """
+        features = get_per_store_features_from_states(states, task)
+        if self.bst is not None and len(self.inputs) > self.num_warmup_sample:
+            dtest, pack_ids = feature_to_pack_sum_xgbmatrix(features)
+            raw_preds = self.bst.predict(dtest)
+            breakdown = predict_throughput_pack_sum(raw_preds, pack_ids)
+            stage_scores = [[] for _ in range(len(states))]
+            for pred, pack_id in zip(raw_preds, pack_ids):
+                stage_scores[pack_id].append(pred)
+            for idx, stage_score in enumerate(stage_scores):
+                breakdown = np.append(breakdown, len(stage_score))
+                breakdown = np.concatenate((breakdown, np.array(stage_score)))
+        else:
+            breakdown = np.concatenate(
+                (np.random.uniform(0, 1, (len(states), )), 
np.zeros(len(states), )))
+
+        # Predict 0 for invalid states that failed to be lowered.
+        for idx, feature in enumerate(features):
+            if feature.min() == feature.max() == 0:
+                breakdown[idx] = float('-inf')
+
+        return breakdown
+
+    def load_log_file(self, file_name, n_lines=None):
+        """Load measure records from a log file to pre-train the cost model
+
+        Parameters
+        ----------
+        file_name: str
+            The filename
+        n_lines: Optional[int]
+            Only load the first `n_lines` lines of this log file
+        """
+        inputs, results = RecordReader(file_name).read_lines(n_lines)
+        logger.info("XGBModel: Loaded %s measurement records from %s", 
len(inputs), file_name)
+        self.update(inputs, results)
+
+    def save(self, file_name: str):
+        """Save the model to a file
+
+        Parameters
+        ----------
+        file_name: str
+            The filename
+        """
+        self.bst.save_model(file_name)
+
+    def load(self, file_name: str):
+        """Load the model from a file
+
+        Parameters
+        ----------
+        file_name: str
+            The filename
+        """
+        if self.bst is None:
+            self.bst = xgb.Booster(self.xgb_params)
+        self.bst.load_model(file_name)
+        self.num_warmup_sample = -1
+
+
+def feature_to_pack_sum_xgbmatrix(xs):
+    """Convert an extracted multi-stage feature vector to a xgbmatrix in 
pack-sum format
+
+    Parameters
+    ----------
+    xs: np.ndarray
+        The feature vector
+
+    Returns
+    -------
+    dmatrix: xgb.DMatrix
+        The DMatrix
+    pack_ids: List[int]
+        pack ids information
+    """
+    x_flatten = []
+    pack_ids = []
+
+    for ct, x in enumerate(xs):
+        for row in x:
+            x_flatten.append(row)
+            pack_ids.append(ct)
+
+    return xgb.DMatrix(np.array(x_flatten)), pack_ids
+
+
+def pack_sum_xgbmatrix(xs, ys, gids=None, weights=None):
+    """Convert (feature, label) pairs into a xgb matrix with pack-sum format
+
+    Parameters
+    ----------
+    xs: np.ndarray
+        The feature vector
+    ys: np.ndarray
+        The normalized throughput
+    gids: Optional[List[int]]
+        Group id (task id)
+    weights: Optional[np.ndarray]
+        The weight of samples
+
+    Returns
+    -------
+    dmatrix: xgb.DMatrix
+        The DMatrix with pack-sum information
+    """
+    if gids is not None:
+        # sort by group
+        indices = gids.argsort()
+        xs, ys = xs[indices], ys[indices]
+        group_sizes = np.bincount(gids)
+        if weights is not None:
+            weights = weights[indices]
+    else:
+        # assume it has only one group
+        group_sizes = [len(xs)]
+
+    x_flatten = []
+    y_flatten = []
+    weights_flatten = []
+    pack_ids = []
+
+    if weights is not None:
+        for ct, (x, y, w) in enumerate(zip(xs, ys, weights)):
+            for row in x:
+                x_flatten.append(row)
+                y_flatten.append(y)
+                weights_flatten.append(w)
+                pack_ids.append(ct)
+    else:
+        for ct, (x, y) in enumerate(zip(xs, ys)):
+            for row in x:
+                x_flatten.append(row)
+                y_flatten.append(y)
+                pack_ids.append(ct)
+
+    ret = xgb.DMatrix(np.array(x_flatten), y_flatten)
+    if weights is not None:
+        ret.set_weight(weights_flatten)
+    dmatrix_context.set('pack_ids', ret, np.array(pack_ids))
+    dmatrix_context.set('group_sizes', ret, group_sizes)
+    return ret
+
+
+def predict_throughput_pack_sum(raw_preds, pack_ids):
+    """Predict the throughputs for predictions in pack-sum format
+
+    Parameters
+    ----------
+    raw_preds: np.ndarray
+        The raw predictions
+    pack_ids: List[int]
+        The pack id for predictions
+
+    Returns
+    -------
+    throughputs: np.ndarray
+        The throughput
+    """
+    sum_pred = np.bincount(pack_ids, weights=raw_preds)
+    return sum_pred
+
+def pack_sum_square_error(preds, dtrain):
+    """Implement square error loss on pack-sum format as
+     a custom objective function for xgboost.
+
+    Parameters
+    ----------
+    preds: np.ndarray
+        The predictions
+    dtrain: xgb.DMatrix
+        The training set
+
+    Returns
+    -------
+    gradient and hessian
+    """
+    pack_ids = dmatrix_context.get("pack_ids", dtrain)
+    weight = dtrain.get_weight()
+
+    sum_pred = np.bincount(pack_ids, weights=preds)
+    x = sum_pred[pack_ids]
+    y = dtrain.get_label()
+    gradient = x - y
+    hessian = np.ones_like(gradient)
+
+    if len(weight) == 0:
+        return gradient, hessian
+
+    return gradient * weight, hessian * weight
+
+def pack_sum_rmse(raw_preds, labels):
+    """Evaluate RMSE (rooted mean square error) in the pack-sum format
+
+    Parameters
+    ----------
+    raw_preds: np.ndarray
+        The raw prediction
+    labels: xgb.DMatrix
+        The ground-truth label matrix
+
+    Returns
+    -------
+    The name and value of the metric
+    """
+    pack_ids = dmatrix_context.get("pack_ids", labels)
+    preds = predict_throughput_pack_sum(raw_preds, pack_ids)[pack_ids]
+    return 'p-rmse', np.sqrt(np.mean(np.square((preds - labels.get_label()))))
+
+def pack_sum_average_peak_score(N):
+    """Return the evaluation function for average-peak-score@N
+
+    Parameters
+    ----------
+    N: int
+        The "N" in "average-peak-score@N"
+
+    Returns
+    -------
+    The evaluation function
+    """
+
+    def feval(preds, labels):
+        """Evaluate average-peak-score@N in the pack-sum format
+
+        Parameters
+        ----------
+        raw_preds: np.ndarray
+            The raw prediction
+        labels: xgb.DMatrix
+            The ground-truth label matrix
+
+        Returns
+        -------
+        The name and value of the metric

Review comment:
       Type missing.

##########
File path: python/tvm/auto_scheduler/cost_model/xgb_model.py
##########
@@ -0,0 +1,590 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+# pylint: disable=invalid-name
+
+"""Cost model based on xgboost"""
+import multiprocessing
+import logging
+from collections import defaultdict
+
+import numpy as np
+import xgboost as xgb
+from xgboost.core import EarlyStopException
+from xgboost.callback import _fmt_metric
+from xgboost.training import aggcv
+
+from tvm.autotvm.tuner.metric import max_curve
+from .cost_model import PythonBasedModel
+from ..feature import get_per_store_features_from_measure_pairs, 
get_per_store_features_from_states
+from ..measure_record import RecordReader
+
+logger = logging.getLogger('auto_scheduler')
+
+class XGBDMatrixContext:
+    """A global context to hold additional attributes of xgb.DMatrix"""
+    def __init__(self):
+        self.context_dict = defaultdict(dict)
+
+    def get(self, key, matrix, default=None):
+        """
+        Get an attribute of a xgb.DMatrix
+
+        Parameters
+        ----------
+        key: str
+            The name of the attribute
+        matrix: xgb.DMatrix
+            The matrix
+        default: Optional
+            The default value if the item does not exist
+        """
+        return self.context_dict[key].get(matrix.handle.value, default)
+
+    def set(self, key, matrix, value):
+        """
+        Set an attribute for a xgb.DMatrix
+
+        Parameters
+        ----------
+        key: str
+            The name of the attribute
+        matrix: xgb.DMatrix
+            The matrix
+        value: Optional
+            The new value
+        """
+        self.context_dict[key][matrix.handle.value] = value
+
+dmatrix_context = XGBDMatrixContext()
+
+
+class XGBModel(PythonBasedModel):
+    """Train a XGBoost model to predict the normalized throughputs of programs.
+
+    Let the normalized throughput be the score of a program (higher is 
better). We predict
+    (approximate) the score of a program = the sum of the scores of all 
stages in this program.
+    i.e. score(P) = score_s0 + score_s1 + ... + score_sn,
+    where score_si is the score of Stage i in Program P.
+
+    We extract feature for each stage and let the xgboost predict the score 
for each stage.
+    We then sum up the predictions as the score of the whole program.
+
+    We use RMSE as the loss function.  i.e. loss(P, y) = 1/2 * (score(P) - 
y)^2,
+    where P is the program and y is the normalized throughput according to
+    the ground truth (measurement).
+    XGBoost does not support this loss function because `score(P)` is a sum of 
the prediction
+    of several samples, so we implemented a custom loss function and call it 
pack-sum-rmse.
+    It is called "pack-sum" because we combine several samples into a "pack" 
and sum up
+    their predictions.
+    """
+    def __init__(self, verbose_eval=25, num_warmup_sample=100, seed=None):
+        self.xgb_params = {
+            'max_depth': 10,
+            'gamma': 0.001,
+            'min_child_weight': 0,
+            'eta': 0.2,
+            # todo(merrymercy): automatically decrease learning rate when the 
loss is too large
+
+            'n_gpus': 0,
+            'nthread': multiprocessing.cpu_count() // 2,
+            'verbosity': 0,
+            'seed': seed or 43,
+            'disable_default_eval_metric': 1
+        }
+        self.bst = None
+        self.plan_size = 32
+        self.num_warmup_sample = num_warmup_sample
+        self.verbose_eval = verbose_eval
+
+        super().__init__()
+
+        # cache measurement input/result pairs and extracted features
+        self.inputs = []
+        self.results = []
+        self.inputs_feature_cache = []
+
+    def update(self, inputs, results):
+        """Update the cost model according to new measurement results 
(training data).
+
+        Parameters
+        ----------
+        inputs : List[MeasureInput]
+            The measurement inputs
+        results : List[MeasureResult]
+            The measurement results
+        """
+        if len(inputs) <= 0:
+            return
+
+        self.inputs.extend(inputs)
+        self.results.extend(results)
+
+        # extract feature
+        n_cached = len(self.inputs_feature_cache)
+        features, normalized_throughputs, task_ids = \
+            get_per_store_features_from_measure_pairs(self.inputs, 
self.results,
+                                                      
skip_first_n_feature_extraction=n_cached)
+        if n_cached > 0:
+            features = list(features)
+            features[:n_cached] = self.inputs_feature_cache
+            features = np.array(features, dtype=object)
+        self.inputs_feature_cache = features
+        dtrain = pack_sum_xgbmatrix(features, normalized_throughputs,
+                                    task_ids, normalized_throughputs)
+
+        # train xgb model
+        self.bst = xgb.train(self.xgb_params, dtrain,
+                             num_boost_round=10000,
+                             obj=pack_sum_square_error,
+                             callbacks=[custom_callback(
+                                 stopping_rounds=50,
+                                 metric='tr-p-rmse',
+                                 fevals=[
+                                     pack_sum_rmse, 
pack_sum_average_peak_score(self.plan_size),
+                                 ],
+                                 evals=[(dtrain, 'tr')],
+                                 maximize=False,
+                                 verbose_eval=self.verbose_eval)])
+
+    def predict(self, task, states):
+        """Predict the scores of states
+
+        Parameters
+        ----------
+        search_task : SearchTask
+            The search task of states
+        states : List[State]
+            The input states
+
+        Returns
+        -------
+        scores: List[float]
+            The predicted scores for all states
+        """
+        features = get_per_store_features_from_states(states, task)
+        if self.bst is not None and len(self.inputs) > self.num_warmup_sample:
+            dtest, pack_ids = feature_to_pack_sum_xgbmatrix(features)
+            raw_preds = self.bst.predict(dtest)
+            ret = predict_throughput_pack_sum(raw_preds, pack_ids)
+        else:
+            ret = np.random.uniform(0, 1, (len(states),))
+
+        # Predict 0 for invalid states that failed to be lowered.
+        for idx, feature in enumerate(features):
+            if feature.min() == feature.max() == 0:
+                ret[idx] = float('-inf')
+
+        return ret
+
+    def predict_stages(self, task, states):
+        """Predict the scores of all stages in states. This is the breakdown 
version of `predict`.
+
+        Parameters
+        ----------
+        search_task : SearchTask
+            The search task of states
+        states : List[State]
+            The input states
+
+        Returns
+        -------
+        scores: List[float]
+            The predicted scores for all stages in all states in the packed 
format
+
+        Note
+        ----
+        For faster data copy between c++ and python, the python part returns 
scores in a
+        single flatten array using a packed format. The c++ part then unpacks 
the flatten array.
+
+        The packed format is:
+        {
+          float  scores[N];                 // scores[i] is the score for 
states[i].
+          int    n_stage_0;                 // the number of stages in 
states[0]
+          float  stage_scores_0[n_stage_0]; // the scores for all stages in 
states[0]
+          int    n_stage_1;                 // the number of stages in 
states[1]
+          float  stage_scores_1[n_stage_1]; // the scores for all stages in 
states[1]
+          ...
+          int    n_stage_i;                 // the number of stages in 
states[i]
+          float  stage_scores_i[n_stage_i]; // the scores for all stages in 
states[i]
+          ...  // until i == N - 1
+        }
+        To implement this format, we also store int as float, so we can store 
all numbers
+        into a single float array.
+        """
+        features = get_per_store_features_from_states(states, task)
+        if self.bst is not None and len(self.inputs) > self.num_warmup_sample:
+            dtest, pack_ids = feature_to_pack_sum_xgbmatrix(features)
+            raw_preds = self.bst.predict(dtest)
+            breakdown = predict_throughput_pack_sum(raw_preds, pack_ids)
+            stage_scores = [[] for _ in range(len(states))]
+            for pred, pack_id in zip(raw_preds, pack_ids):
+                stage_scores[pack_id].append(pred)
+            for idx, stage_score in enumerate(stage_scores):
+                breakdown = np.append(breakdown, len(stage_score))
+                breakdown = np.concatenate((breakdown, np.array(stage_score)))
+        else:
+            breakdown = np.concatenate(
+                (np.random.uniform(0, 1, (len(states), )), 
np.zeros(len(states), )))
+
+        # Predict 0 for invalid states that failed to be lowered.
+        for idx, feature in enumerate(features):
+            if feature.min() == feature.max() == 0:
+                breakdown[idx] = float('-inf')
+
+        return breakdown
+
+    def load_log_file(self, file_name, n_lines=None):
+        """Load measure records from a log file to pre-train the cost model
+
+        Parameters
+        ----------
+        file_name: str
+            The filename
+        n_lines: int
+            Only
+        """
+        inputs, results = RecordReader(file_name).read_lines(n_lines)
+        logger.info("XGBModel: Loaded %s measurement records from %s", 
len(inputs), file_name)
+        self.update(inputs, results)
+
+    def save(self, file_name: str):
+        """Save the model to a file
+
+        Parameters
+        ----------
+        file_name: str
+            The filename
+        """
+        self.bst.save_model(file_name)
+
+    def load(self, file_name: str):
+        """Load the model from a file
+
+        Parameters
+        ----------
+        file_name: str
+            The filename
+        """
+        if self.bst is None:
+            self.bst = xgb.Booster(self.xgb_params)
+        self.bst.load_model(file_name)
+        self.num_warmup_sample = -1
+
+
def feature_to_pack_sum_xgbmatrix(xs):
    """Convert an extracted multi-stage feature vector into an xgb DMatrix
    in pack-sum format.

    Parameters
    ----------
    xs: np.ndarray
        The feature vectors; each entry holds the per-stage feature rows
        of one state.

    Returns
    -------
    dmatrix: xgb.DMatrix
        The DMatrix holding all stage rows, flattened.
    pack_ids: List[int]
        pack_ids[i] is the index of the state that flattened row i belongs to.
    """
    # Flatten all stage rows into one matrix, remembering the owning state.
    flat_rows = [row for x in xs for row in x]
    pack_ids = [state_idx for state_idx, x in enumerate(xs) for _ in x]
    return xgb.DMatrix(np.array(flat_rows)), pack_ids
+
+
def pack_sum_xgbmatrix(xs, ys, gids=None, weights=None):
    """Convert (feature, label) pairs into an xgb DMatrix with pack-sum format.

    Parameters
    ----------
    xs: np.ndarray
        The feature vectors; each entry holds the per-stage feature rows
        of one sample.
    ys: np.ndarray
        The normalized throughputs
    gids: Optional[np.ndarray]
        Group ids (task ids). If given, samples are sorted so members of
        the same group are contiguous.
    weights: Optional[np.ndarray]
        The weights of samples

    Returns
    -------
    dmatrix: xgb.DMatrix
        The DMatrix with pack-sum information.
        'pack_ids' and 'group_sizes' are attached to it via dmatrix_context.
    """
    if gids is not None:
        # Sort by group so rows of the same group are contiguous.
        indices = gids.argsort()
        xs, ys = xs[indices], ys[indices]
        group_sizes = np.bincount(gids)
        if weights is not None:
            weights = weights[indices]
    else:
        # Assume it has only one group
        group_sizes = [len(xs)]

    x_flatten = []
    y_flatten = []
    weights_flatten = []
    pack_ids = []

    # Repeat the label (and weight) of each sample for every one of its
    # stage rows, and record the owning sample index in pack_ids.
    for ct, (x, y) in enumerate(zip(xs, ys)):
        for row in x:
            x_flatten.append(row)
            y_flatten.append(y)
            pack_ids.append(ct)
            if weights is not None:
                weights_flatten.append(weights[ct])

    ret = xgb.DMatrix(np.array(x_flatten), y_flatten)
    if weights is not None:
        ret.set_weight(weights_flatten)
    dmatrix_context.set('pack_ids', ret, np.array(pack_ids))
    dmatrix_context.set('group_sizes', ret, group_sizes)
    return ret
+
+
def predict_throughput_pack_sum(raw_preds, pack_ids):
    """Predict the throughputs for predictions in pack-sum format.

    The predicted throughput of a state is the sum of the raw predictions
    of all of its stage rows.

    Parameters
    ----------
    raw_preds: np.ndarray
        The raw per-stage predictions
    pack_ids: List[int]
        pack_ids[i] is the index of the state that prediction i belongs to

    Returns
    -------
    throughputs: np.ndarray
        throughputs[j] is the summed prediction for state j
    """
    return np.bincount(pack_ids, weights=raw_preds)
+
def pack_sum_square_error(preds, dtrain):
    """Implement square error loss on pack-sum format as
    a custom objective function for xgboost.

    Parameters
    ----------
    preds: np.ndarray
        The raw per-stage predictions
    dtrain: xgb.DMatrix
        The training set, with 'pack_ids' attached via dmatrix_context

    Returns
    -------
    gradient: np.ndarray
        The gradient according to the xgboost format
    hessian: np.ndarray
        The hessian according to the xgboost format
    """
    pack_ids = dmatrix_context.get("pack_ids", dtrain)
    weight = dtrain.get_weight()

    # The prediction for a sample is the sum of the predictions of its
    # stage rows; broadcast that sum back to every row of the sample.
    sum_pred = np.bincount(pack_ids, weights=preds)
    x = sum_pred[pack_ids]
    y = dtrain.get_label()
    gradient = x - y
    hessian = np.ones_like(gradient)

    # get_weight() returns an empty array when no weights were set.
    if len(weight) == 0:
        return gradient, hessian

    return gradient * weight, hessian * weight
+
+def pack_sum_rmse(raw_preds, labels):
+    """Evaluate RMSE (rooted mean square error) in the pack-sum format
+
+    Parameters
+    ----------
+    raw_preds: np.ndarray
+        The raw prediction
+    labels: xgb.DMatrix
+        The groud-truth label matrix
+
+    Returns
+    -------
+    The name and value of the metric

Review comment:
       Type missing.




----------------------------------------------------------------
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

For queries about this service, please contact Infrastructure at:
[email protected]


Reply via email to