Skip to content

Commit 992dbbd

Browse files
authored
migrate notebooks to use openml (#3121)
* migrate notebooks to use openml
* remove `multi_class` flag due to dropped sklearn support
* load svc dataset as dense
* nan/inf fixes in svc
1 parent 15bb5f8 commit 992dbbd

14 files changed

Lines changed: 180 additions & 274 deletions

.ci/pipeline/nightly.yml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -75,7 +75,7 @@ jobs:
7575
pip install -r dependencies-dev
7676
pip install -r requirements-doc.txt
7777
pip install -r requirements-test.txt
78-
pip install jupyter matplotlib requests
78+
pip install jupyter matplotlib requests openml
7979
displayName: 'Install requirements'
8080
- script: |
8181
. /usr/share/miniconda/etc/profile.d/conda.sh

examples/notebooks/ElasticNet.ipynb

Lines changed: 12 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -10,7 +10,7 @@
1010
},
1111
{
1212
"cell_type": "code",
13-
"execution_count": 1,
13+
"execution_count": null,
1414
"id": "b1b922d1",
1515
"metadata": {},
1616
"outputs": [],
@@ -19,7 +19,7 @@
1919
"from sklearn import metrics\n",
2020
"from sklearn.model_selection import train_test_split\n",
2121
"import warnings\n",
22-
"from sklearn.datasets import fetch_openml\n",
22+
"import openml\n",
2323
"from sklearn.preprocessing import LabelEncoder\n",
2424
"from IPython.display import HTML\n",
2525
"\n",
@@ -36,12 +36,20 @@
3636
},
3737
{
3838
"cell_type": "code",
39-
"execution_count": 2,
39+
"execution_count": null,
4040
"id": "00c2277b",
4141
"metadata": {},
4242
"outputs": [],
4343
"source": [
44-
"x, y = fetch_openml(name=\"Airlines_DepDelay_10M\", return_X_y=True)"
44+
"dataset = openml.datasets.get_dataset(\n",
45+
" \"Airlines_DepDelay_10M\",\n",
46+
" download_data=True,\n",
47+
" download_qualities=False,\n",
48+
" download_features_meta_data=False,\n",
49+
")\n",
50+
"x, y, _, _ = dataset.get_data(\n",
51+
" dataset_format=\"dataframe\", target=dataset.default_target_attribute\n",
52+
")"
4553
]
4654
},
4755
{

examples/notebooks/dbscan.ipynb

Lines changed: 12 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -10,15 +10,15 @@
1010
},
1111
{
1212
"cell_type": "code",
13-
"execution_count": 1,
13+
"execution_count": null,
1414
"id": "b1b922d1",
1515
"metadata": {},
1616
"outputs": [],
1717
"source": [
1818
"from timeit import default_timer as timer\n",
1919
"from sklearn.model_selection import train_test_split\n",
2020
"from sklearn.metrics import davies_bouldin_score\n",
21-
"from sklearn.datasets import fetch_openml\n",
21+
"import openml\n",
2222
"from IPython.display import HTML\n",
2323
"import warnings\n",
2424
"\n",
@@ -35,12 +35,20 @@
3535
},
3636
{
3737
"cell_type": "code",
38-
"execution_count": 2,
38+
"execution_count": null,
3939
"id": "7e73dc65",
4040
"metadata": {},
4141
"outputs": [],
4242
"source": [
43-
"x, y = fetch_openml(name=\"spoken-arabic-digit\", return_X_y=True)"
43+
"dataset = openml.datasets.get_dataset(\n",
44+
" \"spoken-arabic-digit\",\n",
45+
" download_data=True,\n",
46+
" download_qualities=False,\n",
47+
" download_features_meta_data=False,\n",
48+
")\n",
49+
"x, y, _, _ = dataset.get_data(\n",
50+
" dataset_format=\"dataframe\", target=dataset.default_target_attribute\n",
51+
")"
4452
]
4553
},
4654
{

examples/notebooks/kmeans.ipynb

Lines changed: 12 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -10,14 +10,14 @@
1010
},
1111
{
1212
"cell_type": "code",
13-
"execution_count": 1,
13+
"execution_count": null,
1414
"id": "b1b922d1",
1515
"metadata": {},
1616
"outputs": [],
1717
"source": [
1818
"from timeit import default_timer as timer\n",
1919
"from sklearn.model_selection import train_test_split\n",
20-
"from sklearn.datasets import fetch_openml\n",
20+
"import openml\n",
2121
"from IPython.display import HTML\n",
2222
"import warnings\n",
2323
"\n",
@@ -34,12 +34,20 @@
3434
},
3535
{
3636
"cell_type": "code",
37-
"execution_count": 2,
37+
"execution_count": null,
3838
"id": "7e73dc65",
3939
"metadata": {},
4040
"outputs": [],
4141
"source": [
42-
"x, y = fetch_openml(name=\"spoken-arabic-digit\", return_X_y=True)"
42+
"dataset = openml.datasets.get_dataset(\n",
43+
" \"spoken-arabic-digit\",\n",
44+
" download_data=True,\n",
45+
" download_qualities=False,\n",
46+
" download_features_meta_data=False,\n",
47+
")\n",
48+
"x, y, _, _ = dataset.get_data(\n",
49+
" dataset_format=\"dataframe\", target=dataset.default_target_attribute\n",
50+
")"
4351
]
4452
},
4553
{

examples/notebooks/knn_mnist.ipynb

Lines changed: 12 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -10,15 +10,15 @@
1010
},
1111
{
1212
"cell_type": "code",
13-
"execution_count": 1,
13+
"execution_count": null,
1414
"id": "23512089",
1515
"metadata": {},
1616
"outputs": [],
1717
"source": [
1818
"from timeit import default_timer as timer\n",
1919
"from IPython.display import HTML\n",
2020
"from sklearn import metrics\n",
21-
"from sklearn.datasets import fetch_openml\n",
21+
"import openml\n",
2222
"from sklearn.model_selection import train_test_split"
2323
]
2424
},
@@ -32,12 +32,20 @@
3232
},
3333
{
3434
"cell_type": "code",
35-
"execution_count": 2,
35+
"execution_count": null,
3636
"id": "27b99b44",
3737
"metadata": {},
3838
"outputs": [],
3939
"source": [
40-
"x, y = fetch_openml(name=\"mnist_784\", return_X_y=True)"
40+
"dataset = openml.datasets.get_dataset(\n",
41+
" \"mnist_784\",\n",
42+
" download_data=True,\n",
43+
" download_qualities=False,\n",
44+
" download_features_meta_data=False,\n",
45+
")\n",
46+
"x, y, _, _ = dataset.get_data(\n",
47+
" dataset_format=\"dataframe\", target=dataset.default_target_attribute\n",
48+
")"
4149
]
4250
},
4351
{

examples/notebooks/logistictic_regression_cifar.ipynb

Lines changed: 13 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -34,14 +34,22 @@
3434
},
3535
{
3636
"cell_type": "code",
37-
"execution_count": 2,
37+
"execution_count": null,
3838
"id": "27b99b44",
3939
"metadata": {},
4040
"outputs": [],
4141
"source": [
42-
"from sklearn.datasets import fetch_openml\n",
42+
"import openml\n",
4343
"\n",
44-
"x, y = fetch_openml(name=\"CIFAR-100\", return_X_y=True)"
44+
"dataset = openml.datasets.get_dataset(\n",
45+
" \"CIFAR-100\",\n",
46+
" download_data=True,\n",
47+
" download_qualities=False,\n",
48+
" download_features_meta_data=False,\n",
49+
")\n",
50+
"x, y, _, _ = dataset.get_data(\n",
51+
" dataset_format=\"dataframe\", target=dataset.default_target_attribute\n",
52+
")"
4553
]
4654
},
4755
{
@@ -121,28 +129,16 @@
121129
},
122130
{
123131
"cell_type": "code",
124-
"execution_count": 5,
132+
"execution_count": null,
125133
"id": "e9b8f06b",
126134
"metadata": {},
127-
"outputs": [
128-
{
129-
"data": {
130-
"text/plain": [
131-
"'Extension for Scikit-learn time: 24.82 s'"
132-
]
133-
},
134-
"execution_count": 5,
135-
"metadata": {},
136-
"output_type": "execute_result"
137-
}
138-
],
135+
"outputs": [],
139136
"source": [
140137
"from sklearn.linear_model import LogisticRegression\n",
141138
"\n",
142139
"params = {\n",
143140
" \"C\": 0.1,\n",
144141
" \"solver\": \"lbfgs\",\n",
145-
" \"multi_class\": \"multinomial\",\n",
146142
" \"n_jobs\": -1,\n",
147143
"}\n",
148144
"start = timer()\n",

examples/notebooks/nusvr_medical_charges.ipynb

Lines changed: 12 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -10,13 +10,13 @@
1010
},
1111
{
1212
"cell_type": "code",
13-
"execution_count": 1,
13+
"execution_count": null,
1414
"id": "27b99b44",
1515
"metadata": {},
1616
"outputs": [],
1717
"source": [
1818
"from timeit import default_timer as timer\n",
19-
"from sklearn.datasets import fetch_openml\n",
19+
"import openml\n",
2020
"from sklearn.model_selection import train_test_split\n",
2121
"from IPython.display import HTML\n",
2222
"import warnings\n",
@@ -34,12 +34,20 @@
3434
},
3535
{
3636
"cell_type": "code",
37-
"execution_count": 2,
37+
"execution_count": null,
3838
"id": "a9b315cc",
3939
"metadata": {},
4040
"outputs": [],
4141
"source": [
42-
"x, y = fetch_openml(name=\"medical_charges_nominal\", return_X_y=True)"
42+
"dataset = openml.datasets.get_dataset(\n",
43+
" \"medical_charges_nominal\",\n",
44+
" download_data=True,\n",
45+
" download_qualities=False,\n",
46+
" download_features_meta_data=False,\n",
47+
")\n",
48+
"x, y, _, _ = dataset.get_data(\n",
49+
" dataset_format=\"dataframe\", target=dataset.default_target_attribute\n",
50+
")"
4351
]
4452
},
4553
{

0 commit comments

Comments
 (0)