Usage examples

Load the dataset from list, ndarray, pandas.DataFrame, pandas.Series

Dataset with categorical features
from catboost import Pool
cat_features = [0,1,2]
data = [["a","b",1,4,5,6],["a","b",4,5,6,7],["c","d",30,40,50,60]]
label = [1,1,-1]
p = Pool(data, label, cat_features)
Dataset without categorical features
from catboost import Pool
data = [[1,4,5,6],[4,5,6,7],[30,40,50,60]]
label = [1,1,-1]
p = Pool(data, label)
Dataset without labels (for prediction)
from catboost import Pool
data = [[1,4,5,6],[4,5,6,7],[30,40,50,60]]
p = Pool(data)

Load the dataset from a file

Empty dataset
from catboost import Pool
p = Pool(None)
Dataset with specified column descriptions
from catboost import Pool
p = Pool(DATA_FILE, column_description=CD_FILE)
Dataset with no column descriptions specified
from catboost import Pool
p = Pool(DATA_FILE)

DATA_FILE is the file with the object descriptions. It is assumed that the first column (indexed 0) defines the label value, and the other columns are the values of numerical features.

Binary classification

CatBoostClassifier class with array_like data
from catboost import CatBoostClassifier
# Initialize data
cat_features = [0,1,2]
train_data = [["a","b",1,4,5,6],["a","b",4,5,6,7],["c","d",30,40,50,60]]
train_labels = [1,1,-1]
test_data = [["a","b",2,4,6,8],["a","d",1,4,50,60]]
# Initialize CatBoostClassifier
model = CatBoostClassifier(iterations=2, learning_rate=1, depth=2, loss_function='Logloss')
# Fit model
model.fit(train_data, train_labels, cat_features)
# Get predicted classes
preds_class = model.predict(test_data)
# Get predicted probabilities for each class
preds_proba = model.predict_proba(test_data)
# Get predicted RawFormulaVal
preds_raw = model.predict(test_data, prediction_type='RawFormulaVal')
CatBoostClassifier class with input files
from catboost import Pool, CatBoostClassifier

TRAIN_FILE = '../data/adult/train_small'
TEST_FILE = '../data/adult/test_small'
CD_FILE = '../data/adult/train.cd'
# Load data from files to Pool
train_pool = Pool(TRAIN_FILE, column_description=CD_FILE)
test_pool = Pool(TEST_FILE, column_description=CD_FILE)
# Initialize CatBoostClassifier
model = CatBoostClassifier(iterations=2, learning_rate=1, depth=2, loss_function='Logloss')
# Fit model
model.fit(train_pool)
# Get predicted classes
preds_class  = model.predict(test_pool)
# Get predicted probabilities for each class
preds_proba  = model.predict_proba(test_pool)
# Get predicted RawFormulaVal
preds_raw  = model.predict(test_pool, prediction_type='RawFormulaVal')                            
CatBoost class with input files
from catboost import Pool, CatBoost

TRAIN_FILE = '../data/adult/train_small'
TEST_FILE = '../data/adult/test_small'
CD_FILE = '../data/adult/train.cd'
# Load data from files to Pool
train_pool = Pool(TRAIN_FILE, column_description=CD_FILE)
test_pool = Pool(TEST_FILE, column_description=CD_FILE)
# Initialize params
params = {'iterations':2, 'learning_rate':1, 'depth':2, 'loss_function':'Logloss'}
# Initialize CatBoost
model = CatBoost(params)
# Fit model
model.fit(train_pool)
# Get predicted classes
preds_class = model.predict(test_pool, prediction_type="Class")
# Get predicted probabilities for each class
preds_proba = model.predict(test_pool, prediction_type="Probability")
# Get predicted RawFormulaVal
preds_raw = model.predict(test_pool, prediction_type="RawFormulaVal")

Multiclassification

CatBoostClassifier class with input files
from catboost import Pool, CatBoostClassifier

TRAIN_FILE = '../data/cloudness_small/train_small'
TEST_FILE = '../data/cloudness_small/test_small'
CD_FILE = '../data/cloudness_small/train.cd'
# Load data from files to Pool
train_pool = Pool(TRAIN_FILE, column_description=CD_FILE)
test_pool = Pool(TEST_FILE, column_description=CD_FILE)
# Initialize CatBoostClassifier
model = CatBoostClassifier(iterations=2, learning_rate=1, depth=2, loss_function='MultiClass')
# Fit model
model.fit(train_pool)
# Get predicted classes
preds_class = model.predict(test_pool)
# Get predicted probabilities for each class
preds_proba = model.predict_proba(test_pool)
# Get predicted RawFormulaVal
preds_raw = model.predict(test_pool, prediction_type='RawFormulaVal')                            
CatBoostClassifier with FeaturesData
import numpy as np
from catboost import CatBoostClassifier, FeaturesData
# Initialize data
cat_features = [0,1,2]
train_data = FeaturesData(
    num_feature_data=np.array([[1, 4, 5, 6], [4, 5, 6, 7], [30, 40, 50, 60]], dtype=np.float32),
    cat_feature_data=np.array([[b"a", b"b"], [b"a", b"b"], [b"c", b"d"]], dtype=object)
)
train_labels = [1,1,-1]
test_data = FeaturesData(
    num_feature_data=np.array([[2, 4, 6, 8], [1, 4, 50, 60]], dtype=np.float32),
    cat_feature_data=np.array([[b"a", b"b"], [b"a", b"d"]], dtype=object)
)
# Initialize CatBoostClassifier
model = CatBoostClassifier(iterations=2, learning_rate=1, depth=2, loss_function='Logloss')
# Fit model
model.fit(train_data, train_labels)
# Get predicted classes
preds_class = model.predict(test_data)
# Get predicted probabilities for each class
preds_proba = model.predict_proba(test_data)
# Get predicted RawFormulaVal
preds_raw = model.predict(test_data, prediction_type='RawFormulaVal')
CatBoostClassifier with Pool and FeaturesData
import numpy as np
from catboost import CatBoostClassifier, FeaturesData, Pool
# Initialize data
train_data = Pool(
    data=FeaturesData(
        num_feature_data=np.array([[1, 4, 5, 6], [4, 5, 6, 7], [30, 40, 50, 60]], dtype=np.float32),
        cat_feature_data=np.array([[b"a", b"b"], [b"a", b"b"], [b"c", b"d"]], dtype=object)
    ),
    label=[1, 1, -1]
)
test_data = Pool(
    data=FeaturesData(
        num_feature_data=np.array([[2, 4, 6, 8], [1, 4, 50, 60]], dtype=np.float32),
        cat_feature_data=np.array([[b"a", b"b"], [b"a", b"d"]], dtype=object)
    )
)
# Initialize CatBoostClassifier
model = CatBoostClassifier(iterations=2, learning_rate=1, depth=2, loss_function='Logloss')
# Fit model
model.fit(train_data)
# Get predicted classes
preds_class = model.predict(test_data)
# Get predicted probabilities for each class
preds_proba = model.predict_proba(test_data)
# Get predicted RawFormulaVal
preds_raw = model.predict(test_data, prediction_type='RawFormulaVal')

Regression

CatBoostRegressor class with array_like data
from catboost import CatBoostRegressor
# Initialize data
cat_features = [0,1,2]
train_data = [["a","b",1,4,5,6],["a","b",4,5,6,7],["c","d",30,40,50,60]]
test_data = [["a","b",2,4,6,8],["a","d",1,4,50,60]]
train_labels = [10,20,30]
# Initialize CatBoostRegressor
model = CatBoostRegressor(iterations=2, learning_rate=1, depth=2)
# Fit model
model.fit(train_data, train_labels, cat_features)
# Get predictions
preds = model.predict(test_data)
CatBoostRegressor class with input files
from catboost import Pool, CatBoostRegressor

TRAIN_FILE = '../data/adult/train_small'
TEST_FILE = '../data/adult/test_small'
CD_FILE = '../data/adult/train.cd'
# Load data from files to Pool
train_pool = Pool(TRAIN_FILE, column_description=CD_FILE)
test_pool = Pool(TEST_FILE, column_description=CD_FILE)
# Initialize CatBoostRegressor
model = CatBoostRegressor(iterations=2, learning_rate=1, depth=2)
# Fit model
model.fit(train_pool)
# Get predictions
preds = model.predict(test_pool)

CV

Perform cross-validation on the given dataset:
from catboost import Pool, cv

pool = Pool(x_train, y_train)
params = {'iterations': 100, 
          'depth': 2, 
          'loss_function': 'MultiClass', 
          'classes_count': 3, 
          'verbose': False}
scores = cv(pool, params)
Perform cross-validation and save ROC curve points to the roc-curve output file:
from catboost import Pool, cv

input_pool = Pool("/home/ironman/catboost/pytest/data/adult/train_small", 
column_description="/home/ironman/catboost/pytest/data/adult/train.cd")
params = {'iterations': 100, 
          'depth': 2, 
          'loss_function': 'Logloss', 
          'verbose': False, 
          'roc_file': 'roc-file'}
scores = cv(input_pool, params)

Using pre-training results (baseline)

A pre-trained model can be used. The results (only raw_values, not probability or class) can be set as baseline for the new model.

The form of the baseline depends on the machine learning problem being solved:
  • Classification — a two-dimensional array: shape = (length of data, number of classes)
  • Regression — a one-dimensional array.
import numpy as np
from catboost import Pool, CatBoostClassifier

TRAIN_FILE = '../data/cloudness_small/train_small'
EVAL_FILE = '../data/cloudness_small/test_small'
CD_FILE = '../data/cloudness_small/train.cd'
# Load data from files to Pool
pool = Pool(TRAIN_FILE, column_description=CD_FILE)
eval_pool = Pool(EVAL_FILE, column_description=CD_FILE)
# Initialize CatBoostClassifier base_model to predict baselines
base_model = CatBoostClassifier(iterations=100, loss_function="MultiClass", random_seed=0)
# Fit model
base_model.fit(pool)
# Get baselines
baseline = base_model.predict(pool, prediction_type='RawFormulaVal')
eval_baseline = base_model.predict(eval_pool, prediction_type='RawFormulaVal')
# Set baselines to Pools
pool.set_baseline(baseline)
eval_pool.set_baseline(eval_baseline)
# Initialize CatBoostClassifier
model = CatBoostClassifier(iterations=100, random_seed=0, loss_function="MultiClass")
# Fit model
model.fit(pool, eval_set=eval_pool)
# Get predicted classes
preds_class = model.predict(eval_pool)
# Get predicted probabilities for each class
preds_proba = model.predict_proba(eval_pool)
# Get predicted RawFormulaVal
preds_raw = model.predict(eval_pool, prediction_type='RawFormulaVal')

Using object weights

The weight for each object in the input data can be set in the form of a one-dimensional array-like data (length = data length).

The weight is used for calculating the final values of the trees. By default, it is set to 1 for all objects.

import numpy as np
from catboost import Pool, CatBoostClassifier
# Initialize data
data = np.random.randint(1, 100, size=(100, 10))
label = np.random.randint(2, size=(100))
weight = np.random.random(100)
test_data = np.random.randint(1, 100, size=(100, 10))
# Initialize Pool from data
pool = Pool(data, label, weight=weight)
# Initialize CatBoostClassifier
model = CatBoostClassifier(iterations=10, random_seed=0)
# Fit model
model.fit(pool)
# Get predicted classes
preds_class = model.predict(test_data)
# Get predicted probabilities for each class
preds_proba = model.predict_proba(test_data)
# Get predicted RawFormulaVal
preds_raw = model.predict(test_data, prediction_type='RawFormulaVal')

Using best model

If this parameter is set, the number of trees that are saved in the resulting model is defined as follows:
  1. Build the number of trees defined by the training parameters.
  2. Use the validation dataset to identify the iteration with the optimal value of the metric specified in  --eval-metric (eval_metric).

No trees are saved after this iteration.

The eval_set parameter is obligatory for the fit method if the best model mode is on.

eval_set should have the same values for the following training dataset parameters:
  • cat_features
  • baseline (if set).
from catboost import Pool, CatBoostClassifier

TRAIN_FILE = '../data/cloudness_small/train_small'
TEST_FILE = '../data/cloudness_small/test_small'
CD_FILE = '../data/cloudness_small/train.cd'
# Load data from files to Pool
train_pool = Pool(TRAIN_FILE, column_description=CD_FILE)
eval_pool = Pool(TEST_FILE, column_description=CD_FILE)
test_pool = Pool(TEST_FILE, column_description=CD_FILE)
# Initialize CatBoostClassifier
model = CatBoostClassifier(iterations=100, random_seed=0, loss_function="MultiClass")
# Fit model with `use_best_model=True`
model.fit(train_pool, use_best_model=True, eval_set=eval_pool)
# Get predicted classes
preds_class = model.predict(test_pool)
# Get predicted probabilities for each class
preds_proba = model.predict_proba(test_pool)
# Get predicted RawFormulaVal
preds_raw = model.predict(test_pool, prediction_type='RawFormulaVal')
print("Count of trees in model = {}".format(model.tree_count_))

Using staged_predict

The values of the model can be output for each i-th tree of the model by taking into consideration only the trees in the range [1;i].

This feature is implemented via the staged_predict method (available in the CatBoostClassifier and CatBoostRegressor classes).

from catboost import Pool, CatBoostClassifier

TRAIN_FILE = '../data/cloudness_small/train_small'
TEST_FILE = '../data/cloudness_small/test_small'
CD_FILE = '../data/cloudness_small/train.cd'
# Load data from files to Pool
train_pool = Pool(TRAIN_FILE, column_description=CD_FILE)
test_pool = Pool(TEST_FILE, column_description=CD_FILE)
# Initialize CatBoostClassifier
model = CatBoostClassifier(iterations=100, random_seed=0, loss_function="MultiClass")
# Fit model
model.fit(train_pool)
# Get staged predictions
staged_predictions = list(model.staged_predict(test_pool))
# This is equivalent to calling predict() with `ntree_end` in a loop, but faster
staged_predictions = []
for i in range(1, model.tree_count_ + 1):
    staged_predictions.append(model.predict(test_pool, ntree_end=i))

Load a model from file

from catboost import CatBoostClassifier, Pool

train_data = [[1,3],
              [0,4],
              [1,7]]
train_labels = [1,0,1]
catboost_pool = Pool(train_data, train_labels)
model = CatBoostClassifier(learning_rate=0.03)
model.fit(train_data, train_labels, verbose=False)
# Export the model JSON
model.save_model("model_json", format = "json", export_parameters=None)

from_file = CatBoostClassifier()
# Load the model from JSON
from_file.load_model("model_json", format = 'json')

Custom objective function

A custom objective function can be used by specifying a python object as the value for the loss_function parameter. In this case the objective is always maximized.

Depending on the machine learning problem the python object should have one of the following functions defined:
  • calc_ders_range
  • calc_ders_multi(approxes, target, weight) (for multiclassification)
import math
from catboost import Pool, CatBoostClassifier

class LoglossObjective(object):
    """Custom Logloss objective for CatBoost.

    CatBoost calls ``calc_ders_range`` during training to obtain the first
    and second derivatives of the (maximized) objective for each object.
    """

    def calc_ders_range(self, approxes, targets, weights):
        """Return a list of (der1, der2) pairs, one pair per object.

        approxes, targets, weights are indexed containers of floats
        (containers with only __len__ and __getitem__ defined).
        The weights parameter can be None.
        """
        assert len(approxes) == len(targets)
        if weights is not None:
            assert len(weights) == len(approxes)

        result = []
        # NOTE: range (not Python-2 xrange) so the example runs on Python 3.
        for index in range(len(approxes)):
            # p = sigmoid(approx): predicted probability of the positive class.
            exponent = math.exp(approxes[index])
            p = exponent / (1 + exponent)
            der1 = (1 - p) if targets[index] > 0.0 else -p
            der2 = -p * (1 - p)

            if weights is not None:
                der1 *= weights[index]
                der2 *= weights[index]

            result.append((der1, der2))

        return result

TRAIN_FILE = '../data/adult/train_small'
TEST_FILE = '../data/adult/test_small'
CD_FILE = '../data/adult/train.cd'
# Load data from files to Pool
train_pool = Pool(TRAIN_FILE, column_description=CD_FILE)
test_pool = Pool(TEST_FILE, column_description=CD_FILE)
# Initialize CatBoostClassifier with custom `loss_function`
model = CatBoostClassifier(random_seed=0, loss_function=LoglossObjective(), eval_metric="Logloss")
# Fit model
model.fit(train_pool)
# Only prediction_type='RawFormulaVal' allowed with custom `loss_function`
preds_raw = model.predict(test_pool, prediction_type='RawFormulaVal') 

Custom metric for overfitting detector and best model selection

To set a custom metric for overfitting detector and best model selection:
  1. Create an object that implements the following interface:
    class CustomMetric(object):
        # Interface stub: implement these three methods for a custom metric.
        def get_final_error(self, error, weight):
            # Combine the accumulated error and total weight into the final metric value.
            return 0.0
    
        def is_max_optimal(self):
            # Return True if greater metric values are better.
            return True
    
        def evaluate(self, approxes, target, weight):
            # approxes - list of list-like objects (one object per approx dimension)
            # target - list-like object
            # weight - list-like object, can be None
            return 0.0, 0.0
    The following is an example of the Logloss function implementation:
    import math
    from catboost import Pool, CatBoostClassifier
    
    class LoglossMetric(object):
        """Example custom Logloss eval metric for CatBoost."""

        def get_final_error(self, error, weight):
            # Normalize the accumulated error by the total weight
            # (epsilon guards against division by zero).
            return error / (weight + 1e-38)

        def is_max_optimal(self):
            # Greater metric values are better.
            return True

        def evaluate(self, approxes, target, weight):
            """Return a pair (error sum, weight sum).

            approxes is a list of indexed containers (containers with only
            __len__ and __getitem__ defined), one container per approx
            dimension; each container holds floats.
            target is an indexed container of floats.
            weight is a one-dimensional indexed container, or None.
            """
            assert len(approxes) == 1
            assert len(target) == len(approxes[0])

            approx = approxes[0]

            error_sum = 0.0
            weight_sum = 0.0

            # NOTE: range (not Python-2 xrange) so the example runs on Python 3.
            for i in range(len(approx)):
                w = 1.0 if weight is None else weight[i]
                weight_sum += w
                error_sum += w * (target[i] * approx[i] - math.log(1 + math.exp(approx[i])))

            return error_sum, weight_sum
    
    TRAIN_FILE = '../data/adult/train_small'
    TEST_FILE = '../data/adult/test_small'
    CD_FILE = '../data/adult/train.cd'
    # Load data from files to Pool
    train_pool = Pool(TRAIN_FILE, column_description=CD_FILE)
    eval_pool = Pool(TEST_FILE, column_description=CD_FILE)
    test_pool = Pool(TEST_FILE, column_description=CD_FILE)
    # Initialize CatBoostClassifier with custom `eval_metric`
    model = CatBoostClassifier(iterations=5, random_seed=0, eval_metric=LoglossMetric())
    # Fit model with `use_best_model=True`
    model.fit(train_pool, use_best_model=True, eval_set=eval_pool)
    # Get predictions
    pred = model.predict(test_pool)
  2. Pass the created object to the eval_metric parameter:
    CatBoostClassifier(eval_metric=CustomMetric())
    Note. Other training parameters are omitted in this example.

Exporting the model to Apple CoreML

To export the model to Apple CoreML for further usage on iOS devices:
  1. Train the model and save it in CoreML format.

    For example, if training on the Iris dataset:
    import catboost
    from sklearn import datasets
    
    iris = datasets.load_iris()
    cls = catboost.CatBoostClassifier(loss_function='MultiClass')
    
    cls.fit(iris.data, iris.target)
    
    # Save the model in Apple CoreML format
    cls.save_model("iris.mlmodel", format="coreml", export_parameters={'prediction_type': 'probability'})
  2. Import the resulting model to XCode.

    The following is an example of importing with Swift:
    import CoreML
    
    let model = iris()
    let sepal_l = 7.0
    let sepal_w = 3.2
    let petal_l = 4.7
    let petal_w = 1.4
    
    guard let output = try? model.prediction(input: irisInput(feature_0: sepal_l, feature_1: sepal_w, feature_2: petal_l, feature_3: petal_w)) else {
        fatalError("Unexpected runtime error.")
    }
    
    print(String(
        format: "Output probabilities: %1.5f; %1.5f; %1.5f",
        output.prediction[0].doubleValue,
        output.prediction[1].doubleValue,
        output.prediction[2].doubleValue
    ))

Object strength calculation

Calculate the train_pool objects strength for the pool objects:
from catboost import CatBoost

cb = CatBoost({'random_seed': 0, 'iterations': 10})
cb.fit(train_pool)
indices, scores = cb.get_object_importance(
    pool,
    train_pool,
    top_size=100,
    ostr_type='Average',
    update_method='SinglePoint',
    thread_count=-1)

Get a slice of a pool

Get a slice of five objects from the input dataset:
from catboost import Pool

train_data = [[1,3],
              [0,4],
              [1,7],
              [6,4],
              [5,3]]

input_pool = Pool(train_data)
print(input_pool.num_row())

pool_part = input_pool.slice([0,1,2])
print(pool_part.num_row())
Output:
5
3

Train on GPU

Train a classification model on GPU:
from catboost import CatBoostClassifier

train_data = [[0,3],
              [4,1],
              [8,1],
              [9,1]]
train_labels = [0,0,1,1]

model = CatBoostClassifier(iterations=1000, task_type = "GPU")
model.fit(train_data, train_labels, verbose = False)

Get the best result for each metric

Return the best results for each metric calculated on each input dataset:

from catboost import CatBoostClassifier, Pool

train_data = [[0,3],
              [4,1],
              [8,1],
              [9,1]]

train_labels = [0,0,1,1]

eval_data = [[2,1],
            [3,1],
            [9,0],
            [5,3]]

evaluation_pool = Pool(eval_data)

model = CatBoostClassifier(learning_rate = 0.03, custom_metric = ['Logloss', 'AUC:hints=skip_train~false'])

model.fit(train_data, train_labels, eval_set=evaluation_pool, verbose=False)

print(model.get_best_score())
Note. This example illustrates the usage of the method with the CatBoostClassifier class. The usage with other classes is identical.

Get the identifier of the iteration with the best result

Return the identifier of the iteration with the best result of the evaluation metric or loss function on the last validation set:

from catboost import CatBoostClassifier, Pool

train_data = [[0,3],
              [4,1],
              [8,1],
              [9,1]]

train_labels = [0,0,1,1]

eval_data = [[2,1],
            [3,1],
            [9,0],
            [5,3]]

evaluation_pool = Pool(eval_data)

model = CatBoostClassifier(learning_rate = 0.03, custom_metric = ['Logloss', 'AUC:hints=skip_train~false'])

model.fit(train_data, train_labels, eval_set=evaluation_pool, verbose=False)

print(model.get_best_iteration())
Note. This example illustrates the usage of the method with the CatBoostClassifier class. The usage with other classes is identical.