diff --git a/examples/sparse/README.md b/examples/sparse/README.md
new file mode 100644
index 00000000..c0cd3efc
--- /dev/null
+++ b/examples/sparse/README.md
@@ -0,0 +1,3 @@
+# Sparse data
+
+This example trains a youtube comment spam classifier on a sparse dataset. The comments as raw strings are converted to a sparse matrix of word counts using the `CountVectorizer` from scikit-learn.
diff --git a/examples/sparse/sparse.py b/examples/sparse/sparse.py
new file mode 100644
index 00000000..525376c6
--- /dev/null
+++ b/examples/sparse/sparse.py
@@ -0,0 +1,50 @@
+import pandas as pd
+from sklearn.datasets import fetch_openml
+from sklearn.feature_extraction.text import CountVectorizer
+from sklearn.model_selection import train_test_split
+
+import legateboost as lb
+
+# Alberto, T. & Lochter, J. (2015). YouTube Spam Collection [Dataset].
+# UCI Machine Learning Repository. https://doi.org/10.24432/C58885.
+dataset_names = [
+    "youtube-spam-psy",
+    "youtube-spam-shakira",
+    "youtube-spam-lmfao",
+    "youtube-spam-eminem",
+    "youtube-spam-katyperry",
+]
+X = []
+for dataset_name in dataset_names:
+    dataset = fetch_openml(name=dataset_name, as_frame=True)
+    X.append(dataset.data)
+
+X = pd.concat(X)
+X.reset_index(drop=True, inplace=True)
+y = X["CLASS"]
+X_train, X_test, y_train, y_test = train_test_split(
+    X, y, test_size=0.3, random_state=42
+)
+vectorizer = CountVectorizer()
+X_train_vectorized = vectorizer.fit_transform(X_train["CONTENT"])
+X_test_vectorized = vectorizer.transform(X_test["CONTENT"])
+
+model = lb.LBClassifier().fit(
+    X_train_vectorized, y_train, eval_set=[(X_test_vectorized, y_test)]
+)
+
+example_a = X_test.iloc[15]["CONTENT"]
+print("Comment: {}".format(example_a))
+print(
+    "Probability of spam: {}".format(
+        model.predict_proba(vectorizer.transform([example_a]))[0, 1]
+    )
+)
+
+example_b = X_test.iloc[3]["CONTENT"]
+print("Comment: {}".format(example_b))
+print(
+    "Probability of spam: {}".format(
+        model.predict_proba(vectorizer.transform([example_b]))[0, 1]
+    )
+)
diff --git a/src/models/tree/build_tree.cu b/src/models/tree/build_tree.cu
index 8945205b..c2f224fa 100644
--- a/src/models/tree/build_tree.cu
+++ b/src/models/tree/build_tree.cu
@@ -1339,8 +1339,8 @@ struct build_tree_csr_fn {
   auto [h, h_shape, h_accessor] = GetInputStore(context.input(4).data());
 
   auto num_rows = std::max(X_offsets_shape.hi[0] - X_offsets_shape.lo[0] + 1, 0);
-  auto num_outputs = g_shape.hi[1] - g_shape.lo[1] + 1;
-  EXPECT(g_shape.lo[1] == 0, "Outputs should not be split between workers.");
+  auto num_outputs = g_shape.hi[2] - g_shape.lo[2] + 1;
+  EXPECT(g_shape.lo[2] == 0, "Outputs should not be split between workers.");
 
   // Scalars
   auto max_depth = context.scalars().at(0).value();