diff --git a/docs/reference/notebooks/amazon_review_classification_sklearn.ipynb b/docs/reference/notebooks/amazon_review_classification_sklearn.ipynb index 836b41bf7b..d026d648a0 100644 --- a/docs/reference/notebooks/amazon_review_classification_sklearn.ipynb +++ b/docs/reference/notebooks/amazon_review_classification_sklearn.ipynb @@ -117,15 +117,10 @@ ] }, { + "metadata": {}, "cell_type": "code", - "execution_count": 3, - "metadata": { - "ExecuteTime": { - "end_time": "2023-11-08T20:55:58.032511Z", - "start_time": "2023-11-08T20:55:57.792680Z" - } - }, "outputs": [], + "execution_count": null, "source": [ "# Constants.\n", "RANDOM_SEED = 0\n", "TEST_RATIO = 0.2\n", "DATA_SAMPLE_SIZE = 5000\n", "\n", "TARGET_NAME = \"isHelpful\"\n", "\n", "# Paths.\n", - "DATA_URL = \"ftp://sys.giskard.ai/pub/unit_test_resources/amazon_review_dataset/reviews.json\"\n", - "DATA_PATH = Path.home() / \".giskard\" / \"amazon_review_dataset\" / \"reviews.json\"" + "DATA_URL = \"https://giskard-library-test-datasets.s3.eu-north-1.amazonaws.com/amazon_review_dataset-reviews.json.tar.gz\"\n", + "DATA_PATH = Path.home() / \".giskard\" / \"amazon_review_dataset\" / \"reviews.json.tar.gz\"" ] }, { @@ -156,18 +151,12 @@ ] }, { + "metadata": {}, "cell_type": "code", - "execution_count": 4, - "metadata": { - "ExecuteTime": { - "end_time": "2023-11-08T20:56:00.293536Z", - "start_time": "2023-11-08T20:56:00.234306Z" - }, - "collapsed": false - }, "outputs": [], + "execution_count": null, "source": [ - "def fetch_from_ftp(url: str, file: Path) -> None:\n", - " \"\"\"Helper to fetch data from the FTP server.\"\"\"\n", + "def fetch_demo_data(url: str, file: Path) -> None:\n", + " \"\"\"Helper to fetch demo data from the S3 bucket.\"\"\"\n", " if not file.parent.exists():\n", " file.parent.mkdir(parents=True, exist_ok=True)\n", @@ -181,7 +170,7 @@ "\n", "def download_data(**kwargs) -> pd.DataFrame:\n", " \"\"\"Download the dataset using URL.\"\"\"\n", - " fetch_from_ftp(DATA_URL, DATA_PATH)\n", + " fetch_demo_data(DATA_URL, DATA_PATH)\n", " _df = pd.read_json(DATA_PATH, lines=True, **kwargs)\n", " return _df\n", "\n", @@ -215,10 +204,10 @@ ] }, { - "cell_type": "code", - "execution_count": null, "metadata": {}, + "cell_type": "code", "outputs": [], + "execution_count": null, "source": [ "reviews_df = download_data()\n", "reviews_df = preprocess_data(reviews_df)" diff --git a/docs/reference/notebooks/drug_classification_sklearn.ipynb b/docs/reference/notebooks/drug_classification_sklearn.ipynb index 6c443f487b..c1d19f7d8f 100644 --- a/docs/reference/notebooks/drug_classification_sklearn.ipynb +++ b/docs/reference/notebooks/drug_classification_sklearn.ipynb @@ -97,17 +97,10 @@ ] }, { + "metadata": {}, "cell_type": "code", - "execution_count": 2, - "id": "d44430add2918aa1", - "metadata": { - "ExecuteTime": { - "end_time": "2024-02-09T09:29:15.513819Z", - "start_time": "2024-02-09T09:29:15.470284Z" - }, - "collapsed": false - }, "outputs": [], + "execution_count": null, "source": [ "# Constants.\n", "RANDOM_SEED = 0\n", "TEST_SIZE = 0.2\n", "TARGET_NAME = \"Drug\"\n", "\n", "AGE_BINS = [0, 19, 29, 39, 49, 59, 69, 80]\n", "AGE_CATEGORIES = ['<20s', '20s', '30s', '40s', '50s', '60s', '>60s']\n", "\n", "NA_TO_K_BINS = [0, 9, 19, 29, 50]\n", "NA_TO_K_CATEGORIES = ['<10', '10-20', '20-30', '>30']\n", "\n", "# Paths.\n", - "DATA_URL = \"ftp://sys.giskard.ai/pub/unit_test_resources/drug_classification_dataset/drug200.csv\"\n", - "DATA_PATH = Path.home() / \".giskard\" / \"drug_classification_dataset\" / \"drug200.csv\"" - ] + "DATA_URL = \"https://giskard-library-test-datasets.s3.eu-north-1.amazonaws.com/drug_classification_dataset-drug200.csv.tar.gz\"\n", + "DATA_PATH = Path.home() / \".giskard\" / \"drug_classification_dataset\" / \"drug200.csv.tar.gz\"" ], + "id": "a161e40415287e1f" }, { "cell_type": "markdown", @@ -158,7 +152,7 
@@ }, "outputs": [], "source": [ - "def fetch_from_ftp(url: str, file: Path) -> None:\n", - " \"\"\"Helper to fetch data from the FTP server.\"\"\"\n", + "def fetch_demo_data(url: str, file: Path) -> None:\n", + " \"\"\"Helper to fetch demo data from the S3 bucket.\"\"\"\n", " if not file.parent.exists():\n", " file.parent.mkdir(parents=True, exist_ok=True)\n", @@ -172,7 +166,7 @@ "\n", "def load_data() -> pd.DataFrame:\n", " \"\"\"Load data.\"\"\"\n", - " fetch_from_ftp(DATA_URL, DATA_PATH)\n", + " fetch_demo_data(DATA_URL, DATA_PATH)\n", " df = pd.read_csv(DATA_PATH)\n", " return df\n", "\n", diff --git a/docs/reference/notebooks/fake_real_news_classification.ipynb b/docs/reference/notebooks/fake_real_news_classification.ipynb index cb74b26e6b..2c70e3d8cc 100644 --- a/docs/reference/notebooks/fake_real_news_classification.ipynb +++ b/docs/reference/notebooks/fake_real_news_classification.ipynb @@ -66,7 +66,9 @@ "source": [ "import os\n", "import string\n", + "import tarfile\n", "from pathlib import Path\n", + "from typing import Tuple, Callable\n", "from urllib.request import urlretrieve\n", "\n", "import numpy as np\n", @@ -79,7 +81,6 @@ "from nltk.corpus import stopwords\n", "from sklearn.metrics import accuracy_score\n", "from sklearn.model_selection import train_test_split\n", - "from typing import Tuple, Callable\n", "\n", "from giskard import Dataset, Model, scan, testing" ] @@ -142,7 +143,7 @@ "RANDOM_SEED = 0\n", "\n", "# Paths.\n", - "DATA_URL = \"ftp://sys.giskard.ai/pub/unit_test_resources/fake_real_news_dataset/{}\"\n", + "DATA_URL = \"https://giskard-library-test-datasets.s3.eu-north-1.amazonaws.com/fake_real_news_dataset-{}\"\n", "DATA_PATH = Path.home() / \".giskard\" / \"fake_real_news_dataset\"" ] }, @@ -170,7 +171,7 @@ }, "outputs": [], "source": [ - "def fetch_from_ftp(url: str, file: Path) -> None:\n", - " \"\"\"Helper to fetch data from the FTP server.\"\"\"\n", + "def fetch_demo_data(url: str, file: Path) -> None:\n", + " \"\"\"Helper to fetch demo data from the S3 bucket.\"\"\"\n", " if not file.parent.exists():\n", " file.parent.mkdir(parents=True, exist_ok=True)\n", @@ -184,15 +185,15 @@ "\n", "def fetch_dataset() -> None:\n", - " \"\"\"Gradually fetch all necessary files from the FTP server.\"\"\"\n", - " files_to_fetch = (\"Fake.csv\", \"True.csv\", \"glove_100d.txt\")\n", + " \"\"\"Gradually fetch all necessary files from the S3 bucket.\"\"\"\n", + " files_to_fetch = (\"Fake.csv.tar.gz\", \"True.csv.tar.gz\", \"glove_100d.txt.tar.gz\")\n", " for file_name in files_to_fetch:\n", - " fetch_from_ftp(DATA_URL.format(file_name), DATA_PATH / file_name)\n", + " fetch_demo_data(DATA_URL.format(file_name), DATA_PATH / file_name)\n", "\n", "\n", "def load_data(**kwargs) -> pd.DataFrame:\n", " \"\"\"Load data.\"\"\"\n", - " real_df = pd.read_csv(DATA_PATH / \"True.csv\", **kwargs)\n", - " fake_df = pd.read_csv(DATA_PATH / \"Fake.csv\", **kwargs)\n", + " real_df = pd.read_csv(DATA_PATH / \"True.csv.tar.gz\", **kwargs)\n", + " fake_df = pd.read_csv(DATA_PATH / \"Fake.csv.tar.gz\", **kwargs)\n", "\n", " # Create target column.\n", " real_df[TARGET_COLUMN_NAME] = 0\n", @@ -380,7 +381,7 @@ "def get_embeddings_matrix() -> np.ndarray:\n", " \"\"\"Create matrix, where each row is an embedding of a specific word.\"\"\"\n", " # Load glove embeddings.\n", - "    embeddings_dict = dict(parse_line(*line.rstrip().rsplit(' ')) for line in open(DATA_PATH / \"glove_100d.txt\"))\n", + "    embeddings_dict = dict(parse_line(*line.rstrip().rsplit(' ')) for line in tarfile.open(DATA_PATH / \"glove_100d.txt.tar.gz\", \"r:gz\").extractfile(\"fake_real_news_dataset-glove_100d.txt\").read().decode().splitlines())\n", "\n", " # Create embeddings matrix with glove word vectors.\n", " embeddings_matrix = 
init_embeddings_matrix(embeddings_dict)\n", diff --git a/docs/reference/notebooks/hotel_text_regression.ipynb b/docs/reference/notebooks/hotel_text_regression.ipynb index 74d4538f7a..aa5cfaa90b 100644 --- a/docs/reference/notebooks/hotel_text_regression.ipynb +++ b/docs/reference/notebooks/hotel_text_regression.ipynb @@ -91,24 +91,18 @@ ] }, { + "metadata": {}, "cell_type": "code", - "execution_count": 2, - "metadata": { - "ExecuteTime": { - "end_time": "2023-11-09T12:12:05.303464Z", - "start_time": "2023-11-09T12:12:05.254149Z" - }, - "collapsed": false - }, "outputs": [], + "execution_count": null, "source": [ "# Constants.\n", "FEATURE_COLUMN_NAME = \"Full_Review\"\n", "TARGET_COLUMN_NAME = \"Reviewer_Score\"\n", "\n", "# Paths.\n", - "DATA_URL = \"ftp://sys.giskard.ai/pub/unit_test_resources/hotel_text_regression_dataset/Hotel_Reviews.csv\"\n", - "DATA_PATH = Path.home() / \".giskard\" / \"hotel_text_regression_dataset\" / \"Hotel_Reviews.csv\"" + "DATA_URL = \"https://giskard-library-test-datasets.s3.eu-north-1.amazonaws.com/hotel_text_regression_dataset-Hotel_Reviews.csv.tar.gz\"\n", + "DATA_PATH = Path.home() / \".giskard\" / \"hotel_text_regression_dataset\" / \"Hotel_Reviews.csv.tar.gz\"" ] }, { @@ -142,7 +136,7 @@ }, "outputs": [], "source": [ - "def fetch_from_ftp(url: str, file: Path) -> None:\n", - " \"\"\"Helper to fetch data from the FTP server.\"\"\"\n", + "def fetch_demo_data(url: str, file: Path) -> None:\n", + " \"\"\"Helper to fetch demo data from the S3 bucket.\"\"\"\n", " if not file.parent.exists():\n", " file.parent.mkdir(parents=True, exist_ok=True)\n", @@ -155,7 +149,7 @@ "\n", "\n", "def load_data(**kwargs) -> pd.DataFrame:\n", - " fetch_from_ftp(DATA_URL, DATA_PATH)\n", + " fetch_demo_data(DATA_URL, DATA_PATH)\n", " df = pd.read_csv(DATA_PATH, **kwargs)\n", "\n", " # Create target column.\n", diff --git a/docs/reference/notebooks/ieee_fraud_detection_adversarial_validation.ipynb b/docs/reference/notebooks/ieee_fraud_detection_adversarial_validation.ipynb index 3e98100999..656659ed89 100644 --- a/docs/reference/notebooks/ieee_fraud_detection_adversarial_validation.ipynb +++ b/docs/reference/notebooks/ieee_fraud_detection_adversarial_validation.ipynb @@ -102,23 +102,17 @@ ] }, { + "metadata": {}, "cell_type": "code", - "execution_count": 2, - "metadata": { - "ExecuteTime": { - "end_time": "2023-11-09T12:17:44.751420Z", - "start_time": "2023-11-09T12:17:44.719440Z" - }, - "collapsed": false - }, "outputs": [], + "execution_count": null, "source": [ "# Constants.\n", "TARGET_COLUMN = 'isTest'\n", "IDX_LABEL = 'TransactionID'\n", "\n", "# Paths.\n", - "DATA_URL = \"ftp://sys.giskard.ai/pub/unit_test_resources/fraud_detection_classification_dataset/{}\"\n", + "DATA_URL = \"https://giskard-library-test-datasets.s3.eu-north-1.amazonaws.com/fraud_detection_classification_dataset-{}\"\n", "DATA_PATH = Path.home() / \".giskard\" / \"fraud_detection_classification_dataset\"" ] }, @@ -141,18 +135,12 @@ ] }, { + "metadata": {}, "cell_type": "code", - "execution_count": 3, - "metadata": { - "ExecuteTime": { - "end_time": "2023-11-09T12:17:45.925766Z", - "start_time": "2023-11-09T12:17:45.904823Z" - }, - "collapsed": false - }, "outputs": [], + "execution_count": null, "source": [ - "def fetch_from_ftp(url: str, file: Path) -> None:\n", - " \"\"\"Helper to fetch data from the FTP server.\"\"\"\n", + "def fetch_demo_data(url: str, file: Path) -> None:\n", + " \"\"\"Helper to fetch demo data from the S3 bucket.\"\"\"\n", " if not file.parent.exists():\n", " file.parent.mkdir(parents=True, exist_ok=True)\n", @@ -165,9 +153,9 @@ "\n", "\n", "def fetch_dataset():\n", - " files_to_fetch = [\"train_transaction.csv\", \"train_identity.csv\", \"test_transaction.csv\", \"test_identity.csv\"]\n", + " files_to_fetch = [\"train_transaction.csv.tar.gz\", \"train_identity.csv.tar.gz\", \"test_transaction.csv.tar.gz\", \"test_identity.csv.tar.gz\"]\n", " for file_name in files_to_fetch:\n", - " fetch_from_ftp(DATA_URL.format(file_name), DATA_PATH / file_name)\n", + " fetch_demo_data(DATA_URL.format(file_name), DATA_PATH / file_name)\n", "\n", "\n", "# Define data-types of transactions features.\n", @@ -225,11 +213,11 @@ "def read_set(_type):\n", " \"\"\"Read both transactions and identity data.\"\"\"\n", " print(f\"Reading transactions data...\")\n", - " _df = pd.read_csv(os.path.join(DATA_PATH, f'{_type}_transaction.csv'),\n", + " _df = pd.read_csv(os.path.join(DATA_PATH, f'{_type}_transaction.csv.tar.gz'),\n", " index_col=IDX_LABEL, dtype=DATA_TYPES_TRANSACTION, nrows=250)\n", "\n", " print(f\"Reading identity data...\")\n", - " _df = _df.join(pd.read_csv(os.path.join(DATA_PATH, f'{_type}_identity.csv'),\n", + " _df = _df.join(pd.read_csv(os.path.join(DATA_PATH, f'{_type}_identity.csv.tar.gz'),\n", " index_col=IDX_LABEL, dtype=DATA_TYPES_ID))\n", " return _df\n", "\n", @@ -248,16 +236,10 @@ ] }, { + "metadata": {}, "cell_type": "code", - "execution_count": 4, - "metadata": { - "ExecuteTime": { - "end_time": "2023-11-09T12:17:46.316557Z", - "start_time": "2023-11-09T12:17:46.290804Z" - }, - "collapsed": false - }, "outputs": [], + "execution_count": null, "source": [ "def preprocess_dataset(train_set, test_set):\n", " \"\"\"Unite train and test into common dataframe.\"\"\"\n", diff --git a/docs/reference/notebooks/insurance_prediction_lgbm.ipynb b/docs/reference/notebooks/insurance_prediction_lgbm.ipynb index 162e15d080..ad7467cd04 100644 --- a/docs/reference/notebooks/insurance_prediction_lgbm.ipynb +++ b/docs/reference/notebooks/insurance_prediction_lgbm.ipynb @@ -179,8 +179,8 @@ "CATEGORICAL_COLS = [\"sex\", \"smoker\", \"region\"]\n", "\n", "# Paths.\n", - "DATA_URL = \"ftp://sys.giskard.ai/pub/unit_test_resources/insurance_prediction_dataset/us_health_insurance_dataset.csv\"\n", - "DATA_PATH = Path.home() / \".giskard\" / \"insurance_prediction_dataset\" / \"us_health_insurance_dataset.csv\"" + "DATA_URL = \"https://giskard-library-test-datasets.s3.eu-north-1.amazonaws.com/insurance_prediction_dataset-us_health_insurance_dataset.csv.tar.gz\"\n", + "DATA_PATH = Path.home() / \".giskard\" / \"insurance_prediction_dataset\" / \"us_health_insurance_dataset.csv.tar.gz\"" ] }, { @@ -216,7 +216,7 @@ }, "outputs": [], "source": [ - "def fetch_from_ftp(url: str, file: Path) -> None:\n", - " \"\"\"Helper to fetch data from the FTP server.\"\"\"\n", + "def fetch_demo_data(url: str, file: Path) -> None:\n", + " \"\"\"Helper to fetch demo data from the S3 bucket.\"\"\"\n", " if not file.parent.exists():\n", " file.parent.mkdir(parents=True, exist_ok=True)\n", @@ -230,7 +230,7 @@ "\n", "def download_data(**kwargs) -> pd.DataFrame:\n", " \"\"\"Download the dataset using URL.\"\"\"\n", - " fetch_from_ftp(DATA_URL, DATA_PATH)\n", + " fetch_demo_data(DATA_URL, DATA_PATH)\n", " _df = pd.read_csv(DATA_PATH, **kwargs)\n", " return _df" ] diff --git a/docs/reference/notebooks/m5_sales_prediction_lgbm.ipynb b/docs/reference/notebooks/m5_sales_prediction_lgbm.ipynb index 06e8e45a86..cd35399500 100644 --- a/docs/reference/notebooks/m5_sales_prediction_lgbm.ipynb +++ b/docs/reference/notebooks/m5_sales_prediction_lgbm.ipynb @@ -74,8 +74,8 @@ "from urllib.request import urlretrieve\n", "\n", "import pandas as pd\n", - "from sklearn import preprocessing\n", "from lightgbm import LGBMRegressor\n", + "from sklearn import preprocessing\n", "from sklearn.metrics import r2_score\n", "\n", "from giskard import Dataset, Model, scan, testing" @@ -109,9 +109,9 @@ "SPLIT_DATE = \"2016-03-27\"\n", "\n", "# Paths.\n", - "DATA_URL = \"ftp://sys.giskard.ai/pub/unit_test_resources/m5_sales_prediction_dataset/{}\"\n", + "DATA_URL = \"https://giskard-library-test-datasets.s3.eu-north-1.amazonaws.com/m5_sales_prediction_dataset-{}\"\n", "DATA_PATH = Path.home() / \".giskard\" / \"m5_sales_prediction_dataset\"\n", - "DATA_FILES = [\"calendar.csv\", \"sales_train_validation.csv\", \"sell_prices.csv\"]" + "DATA_FILES = [\"calendar.csv.tar.gz\", \"sales_train_validation.csv.tar.gz\", \"sell_prices.csv.tar.gz\"]" ] }, { @@ -140,7 +140,7 @@ }, "outputs": [], "source": [ - "def fetch_from_ftp(url: str, file: Path) -> None:\n", - " \"\"\"Helper to fetch data from the FTP server.\"\"\"\n", + "def fetch_demo_data(url: str, file: Path) -> None:\n", + " \"\"\"Helper to fetch demo data from the S3 bucket.\"\"\"\n", " if not file.parent.exists():\n", " file.parent.mkdir(parents=True, exist_ok=True)\n", @@ -157,16 +157,16 @@ " for file_name in DATA_FILES:\n", " source = DATA_URL.format(file_name)\n", " destination = DATA_PATH / file_name\n", - " fetch_from_ftp(source, destination)\n", + " fetch_demo_data(source, destination)\n", "\n", "\n", "def load_data(n_series_use: int = 100) -> Tuple[pd.DataFrame, ...]:\n", " \"\"\"Load necessary data files.\"\"\"\n", " fetch_dataset()\n", " \n", - " calendar_df = pd.read_csv(DATA_PATH / \"calendar.csv\")\n", - " prices_df = pd.read_csv(DATA_PATH / 'sell_prices.csv')\n", - " sales_df = pd.read_csv(DATA_PATH / 'sales_train_validation.csv')\n", + " calendar_df = pd.read_csv(DATA_PATH / \"calendar.csv.tar.gz\")\n", + " prices_df = pd.read_csv(DATA_PATH / 'sell_prices.csv.tar.gz')\n", + " sales_df = pd.read_csv(DATA_PATH / 'sales_train_validation.csv.tar.gz')\n", " sales_df = sales_df.iloc[:n_series_use]\n", " \n", " return calendar_df, prices_df, sales_df\n", diff --git a/docs/reference/notebooks/medical_transcript_classification_sklearn.ipynb b/docs/reference/notebooks/medical_transcript_classification_sklearn.ipynb index 4694220c18..b94dd75911 100644 --- a/docs/reference/notebooks/medical_transcript_classification_sklearn.ipynb +++ b/docs/reference/notebooks/medical_transcript_classification_sklearn.ipynb @@ -95,6 +95,7 @@ "source": [ "import string\n", "from pathlib import Path\n", + "from typing import Iterable\n", "from urllib.request import urlretrieve\n", "\n", "import nltk\n", @@ -107,7 +108,6 @@ "from sklearn.model_selection import train_test_split\n", "from sklearn.pipeline import Pipeline\n", "from sklearn.preprocessing import FunctionTransformer\n", - "from typing import Iterable\n", "\n", "from giskard import Dataset, Model, scan, testing" ] @@ -154,8 +154,8 @@ "RANDOM_SEED = 8888\n", "\n", "# Data.\n", - "DATA_URL = \"ftp://sys.giskard.ai/pub/unit_test_resources/medical_transcript_classification_dataset/mtsamples.csv\"\n", - "DATA_PATH = Path.home() / \".giskard\" / \"medical_transcript_classification_dataset\" / \"mtsamples.csv\"" + "DATA_URL = \"https://giskard-library-test-datasets.s3.eu-north-1.amazonaws.com/medical_transcript_classification_dataset-mtsamples.csv.tar.gz\"\n", + "DATA_PATH = Path.home() / \".giskard\" / \"medical_transcript_classification_dataset\" / \"mtsamples.csv.tar.gz\"" ] }, { @@ -214,7 +214,7 @@ }, "outputs": [], "source": [ - "def fetch_from_ftp(url: str, file: Path) -> None:\n", - " \"\"\"Helper to fetch data from the FTP server.\"\"\"\n", + "def fetch_demo_data(url: str, file: Path) -> None:\n", + " \"\"\"Helper to fetch demo data from the S3 bucket.\"\"\"\n", " if not 
file.parent.exists():\n", " file.parent.mkdir(parents=True, exist_ok=True)\n", @@ -228,7 +228,7 @@ "\n", "def load_data() -> pd.DataFrame:\n", " \"\"\"Load and initially preprocess data.\"\"\"\n", - " fetch_from_ftp(DATA_URL, DATA_PATH)\n", + " fetch_demo_data(DATA_URL, DATA_PATH)\n", "\n", " df = pd.read_csv(DATA_PATH)\n", "\n", diff --git a/docs/reference/notebooks/movie_review_sentiment_classification_pytorch_sklearn.ipynb b/docs/reference/notebooks/movie_review_sentiment_classification_pytorch_sklearn.ipynb index 3fe599fbcd..69b110f563 100644 --- a/docs/reference/notebooks/movie_review_sentiment_classification_pytorch_sklearn.ipynb +++ b/docs/reference/notebooks/movie_review_sentiment_classification_pytorch_sklearn.ipynb @@ -105,8 +105,8 @@ "RANDOM_STATE = 0\n", "\n", "# Paths.\n", - "DATA_URL = \"ftp://sys.giskard.ai/pub/unit_test_resources/movie_review_sentiment_classification_dataset/train.jsonl\"\n", - "DATA_PATH = Path.home() / \".giskard\" / \"movie_review_sentiment_classification_dataset\" / \"train.jsonl\"" + "DATA_URL = \"https://giskard-library-test-datasets.s3.eu-north-1.amazonaws.com/movie_review_sentiment_classification_dataset-train.jsonl.tar.gz\"\n", + "DATA_PATH = Path.home() / \".giskard\" / \"movie_review_sentiment_classification_dataset\" / \"train.jsonl.tar.gz\"" ] }, { @@ -135,7 +135,7 @@ }, "outputs": [], "source": [ - "def fetch_from_ftp(url: str, file: Path) -> None:\n", + "def fetch_demo_data(url: str, file: Path) -> None:\n", " if not file.parent.exists():\n", " file.parent.mkdir(parents=True, exist_ok=True)\n", "\n", @@ -148,7 +148,7 @@ "\n", "def load_data(**kwargs) -> pd.DataFrame:\n", " \"\"\"Load data.\"\"\"\n", - " fetch_from_ftp(DATA_URL, DATA_PATH)\n", + " fetch_demo_data(DATA_URL, DATA_PATH)\n", "\n", " df = pd.read_json(DATA_PATH, lines=True, **kwargs)\n", " df = df.drop(columns=\"label_text\")\n", diff --git a/docs/reference/notebooks/tripadvisor_sentiment_classification.ipynb b/docs/reference/notebooks/tripadvisor_sentiment_classification.ipynb index 177c145245..3c1afd870a 100644 --- a/docs/reference/notebooks/tripadvisor_sentiment_classification.ipynb +++ b/docs/reference/notebooks/tripadvisor_sentiment_classification.ipynb @@ -75,6 +75,7 @@ "import string\n", "from dataclasses import dataclass\n", "from pathlib import Path\n", + "from typing import Union, List\n", "from urllib.request import urlretrieve\n", "\n", "import nltk\n", @@ -85,7 +86,6 @@ "from torch.utils.data import DataLoader\n", "from torch.utils.data import TensorDataset\n", "from transformers import DistilBertForSequenceClassification, DistilBertTokenizer\n", - "from typing import Union, List\n", "\n", "from giskard import Dataset, Model, scan, testing" ] @@ -123,9 +123,9 @@ "STOP_WORDS = set(stopwords.words('english'))\n", "RANDOM_SEED = 0\n", "\n", - "DATA_URL = \"ftp://sys.giskard.ai/pub/unit_test_resources/tripadvisor_reviews_dataset/{}\"\n", + "DATA_URL = \"https://giskard-library-test-datasets.s3.eu-north-1.amazonaws.com/tripadvisor_reviews_dataset-{}\"\n", "DATA_PATH = Path.home() / \".giskard\" / \"tripadvisor_reviews_dataset\"\n", - "DATA_FILE_NAME = \"tripadvisor_hotel_reviews.csv\"" + "DATA_FILE_NAME = \"tripadvisor_hotel_reviews.csv.tar.gz\"" ] }, { @@ -181,7 +181,7 @@ "\n", "\n", "# Define data download and pre-processing functions\n", - "def fetch_from_ftp(url: str, file: Path) -> None:\n", + "def fetch_demo_data(url: str, file: Path) -> None:\n", " if not file.parent.exists():\n", " file.parent.mkdir(parents=True, exist_ok=True)\n", "\n", @@ -277,7 +277,7 @@ "\n", "def load_dataset() 
-> pd.DataFrame:\n", " # Download dataset\n", - " fetch_from_ftp(DATA_URL.format(DATA_FILE_NAME), DATA_PATH / DATA_FILE_NAME)\n", + " fetch_demo_data(DATA_URL.format(DATA_FILE_NAME), DATA_PATH / DATA_FILE_NAME)\n", " df = pd.read_csv(DATA_PATH / DATA_FILE_NAME, nrows=MAX_NUM_ROWS)\n", " # Obtain labels for our task.\n", " df[TARGET_COLUMN_NAME] = df.Rating.apply(lambda x: create_label(x))\n", diff --git a/docs/reference/notebooks/wage_classification.ipynb b/docs/reference/notebooks/wage_classification.ipynb index d1053a4b7e..1cb2c755eb 100644 --- a/docs/reference/notebooks/wage_classification.ipynb +++ b/docs/reference/notebooks/wage_classification.ipynb @@ -90,16 +90,10 @@ ] }, { + "metadata": {}, "cell_type": "code", - "execution_count": 2, - "metadata": { - "ExecuteTime": { - "end_time": "2023-11-09T16:41:07.873921Z", - "start_time": "2023-11-09T16:41:07.824526Z" - }, - "collapsed": false - }, "outputs": [], + "execution_count": null, "source": [ "# Constants\n", "RANDOM_SEED = 0\n", @@ -131,8 +125,8 @@ "TARGET_COLUMN = \"income\"\n", "\n", "# Paths.\n", - "DATA_URL = \"ftp://sys.giskard.ai/pub/unit_test_resources/wage_classification_dataset/adult.csv\"\n", - "DATA_PATH = Path.home() / \".giskard\" / \"wage_classification_dataset\" / \"adult.csv\"" + "DATA_URL = \"https://giskard-library-test-datasets.s3.eu-north-1.amazonaws.com/wage_classification_dataset-adult.csv.tar.gz\"\n", + "DATA_PATH = Path.home() / \".giskard\" / \"wage_classification_dataset\" / \"adult.csv.tar.gz\"" ] }, { @@ -154,18 +148,12 @@ ] }, { + "metadata": {}, "cell_type": "code", - "execution_count": 3, - "metadata": { - "ExecuteTime": { - "end_time": "2023-11-09T16:41:09.554619Z", - "start_time": "2023-11-09T16:41:09.531213Z" - }, - "collapsed": false - }, "outputs": [], + "execution_count": null, "source": [ - "def fetch_from_ftp(url: str, file: Path) -> None:\n", - " \"\"\"Helper to fetch data from the FTP server.\"\"\"\n", + "def fetch_demo_data(url: str, file: Path) -> None:\n", + " \"\"\"Helper to fetch demo data from the S3 bucket.\"\"\"\n", " if not file.parent.exists():\n", " file.parent.mkdir(parents=True, exist_ok=True)\n", @@ -179,7 +167,7 @@ "\n", "def download_data(**kwargs) -> pd.DataFrame:\n", " \"\"\"Download the dataset using URL.\"\"\"\n", - " fetch_from_ftp(DATA_URL, DATA_PATH)\n", + " fetch_demo_data(DATA_URL, DATA_PATH)\n", " _df = pd.read_csv(DATA_PATH, **kwargs)\n", " return _df\n", "\n", @@ -192,12 +180,10 @@ ] }, { + "metadata": {}, "cell_type": "code", - "execution_count": null, - "metadata": { - "collapsed": false - }, "outputs": [], + "execution_count": null, "source": [ "income_df = download_data()\n", "income_df = preprocess_data(income_df)" diff --git a/tests/fixtures/amazon_review__binary_classification.py b/tests/fixtures/amazon_review__binary_classification.py index f19d883a09..0550bece7b 100644 --- a/tests/fixtures/amazon_review__binary_classification.py +++ b/tests/fixtures/amazon_review__binary_classification.py @@ -12,7 +12,7 @@ from giskard import Dataset from giskard.models.sklearn import SKLearnModel -from tests.url_utils import fetch_from_ftp +from tests.url_utils import fetch_test_data # Constants. RANDOM_SEED = 0 @@ -24,12 +24,12 @@ FEATURE_COLUMN_NAME = "reviewText" # Data. 
-DATA_URL = "ftp://sys.giskard.ai/pub/unit_test_resources/amazon_review_dataset/reviews.json" -DATA_PATH = Path.home() / ".giskard" / "amazon_review_dataset" / "reviews.json" +DATA_URL = "https://giskard-library-test-datasets.s3.eu-north-1.amazonaws.com/amazon_review_dataset-reviews.json.tar.gz" +DATA_PATH = Path.home() / ".giskard" / "amazon_review_dataset" / "reviews.json.tar.gz" def download_data(**kwargs) -> pd.DataFrame: - fetch_from_ftp(DATA_URL, DATA_PATH) + fetch_test_data(DATA_URL, DATA_PATH) _df = pd.read_json(DATA_PATH, lines=True, **kwargs) return _df diff --git a/tests/fixtures/drug_classification__multiclass_classification.py b/tests/fixtures/drug_classification__multiclass_classification.py index 4366edf300..1132227cf3 100644 --- a/tests/fixtures/drug_classification__multiclass_classification.py +++ b/tests/fixtures/drug_classification__multiclass_classification.py @@ -11,11 +11,13 @@ from giskard import Dataset from giskard.models.sklearn import SKLearnModel -from tests.url_utils import fetch_from_ftp +from tests.url_utils import fetch_test_data # Data. -DATA_URL = "ftp://sys.giskard.ai/pub/unit_test_resources/drug_classification_dataset/drug200.csv" -DATA_PATH = Path.home() / ".giskard" / "drug_classification_dataset" / "drug200.csv" +DATA_URL = ( + "https://giskard-library-test-datasets.s3.eu-north-1.amazonaws.com/drug_classification_dataset-drug200.csv.tar.gz" +) +DATA_PATH = Path.home() / ".giskard" / "drug_classification_dataset" / "drug200.csv.tar.gz" # Constants. TARGET_NAME = "Drug" @@ -50,7 +52,7 @@ def _bin_na_to_k(_df: pd.DataFrame) -> pd.DataFrame: @pytest.fixture(scope="session") def drug_classification_raw_data() -> pd.DataFrame: # Download data. - fetch_from_ftp(DATA_URL, DATA_PATH) + fetch_test_data(DATA_URL, DATA_PATH) # Load and wrap data. raw_data = bin_numerical(pd.read_csv(DATA_PATH)) diff --git a/tests/fixtures/fraud_detection__binary_classification.py b/tests/fixtures/fraud_detection__binary_classification.py index f4c64af4a3..d52d3ff7c3 100644 --- a/tests/fixtures/fraud_detection__binary_classification.py +++ b/tests/fixtures/fraud_detection__binary_classification.py @@ -9,10 +9,10 @@ from sklearn.model_selection import train_test_split from giskard import Dataset, Model -from tests.url_utils import fetch_from_ftp +from tests.url_utils import fetch_test_data # Data. -DATA_URL = "ftp://sys.giskard.ai/pub/unit_test_resources/fraud_detection_classification_dataset/{}" +DATA_URL = "https://giskard-library-test-datasets.s3.eu-north-1.amazonaws.com/fraud_detection_classification_dataset-{}" DATA_PATH = Path.home() / ".giskard" / "fraud_detection_classification_dataset" # Constants. 
@@ -86,9 +86,14 @@ def fetch_dataset(): - files_to_fetch = ["train_transaction.csv", "train_identity.csv", "test_transaction.csv", "test_identity.csv"] + files_to_fetch = [ + "train_transaction.csv.tar.gz", + "train_identity.csv.tar.gz", + "test_transaction.csv.tar.gz", + "test_identity.csv.tar.gz", + ] for file_name in files_to_fetch: - fetch_from_ftp(DATA_URL.format(file_name), DATA_PATH / file_name) + fetch_test_data(DATA_URL.format(file_name), DATA_PATH / file_name) def read_set(_type, nrows=150): @@ -96,9 +101,9 @@ def read_set(_type, nrows=150): fetch_dataset() _df = pd.read_csv( - DATA_PATH / f"{_type}_transaction.csv", index_col=IDX_LABEL, dtype=DATA_TYPES_TRANSACTION, nrows=nrows + DATA_PATH / f"{_type}_transaction.csv.tar.gz", index_col=IDX_LABEL, dtype=DATA_TYPES_TRANSACTION, nrows=nrows ) - _df = _df.join(pd.read_csv(DATA_PATH / f"{_type}_identity.csv", index_col=IDX_LABEL, dtype=DATA_TYPES_ID)) + _df = _df.join(pd.read_csv(DATA_PATH / f"{_type}_identity.csv.tar.gz", index_col=IDX_LABEL, dtype=DATA_TYPES_ID)) return _df diff --git a/tests/fixtures/hotel_text__regression.py b/tests/fixtures/hotel_text__regression.py index 32a0d891cf..ba9ba21081 100644 --- a/tests/fixtures/hotel_text__regression.py +++ b/tests/fixtures/hotel_text__regression.py @@ -11,11 +11,11 @@ from giskard import Dataset from giskard.models.sklearn import SKLearnModel -from tests.url_utils import fetch_from_ftp +from tests.url_utils import fetch_test_data # Data. -DATA_URL = "ftp://sys.giskard.ai/pub/unit_test_resources/hotel_text_regression_dataset/Hotel_Reviews.csv" -DATA_PATH = Path.home() / ".giskard" / "hotel_text_regression_dataset" / "Hotel_Reviews.csv" +DATA_URL = "https://giskard-library-test-datasets.s3.eu-north-1.amazonaws.com/hotel_text_regression_dataset-Hotel_Reviews.csv.tar.gz" +DATA_PATH = Path.home() / ".giskard" / "hotel_text_regression_dataset" / "Hotel_Reviews.csv.tar.gz" # Constants. FEATURE_COLUMN_NAME = "Full_Review" @@ -33,7 +33,7 @@ def load_data(**kwargs) -> pd.DataFrame: @pytest.fixture(scope="session") def hotel_text_raw_data(): - fetch_from_ftp(DATA_URL, DATA_PATH) + fetch_test_data(DATA_URL, DATA_PATH) raw_data = load_data(nrows=105)[[FEATURE_COLUMN_NAME, TARGET_COLUMN_NAME]] return raw_data diff --git a/tests/fixtures/medical_transcript_multiclass_classification.py b/tests/fixtures/medical_transcript_multiclass_classification.py index ea4acf349d..8652a75227 100644 --- a/tests/fixtures/medical_transcript_multiclass_classification.py +++ b/tests/fixtures/medical_transcript_multiclass_classification.py @@ -12,7 +12,7 @@ from giskard import Dataset from giskard.models.sklearn import SKLearnModel -from tests.url_utils import fetch_from_ftp +from tests.url_utils import fetch_test_data # Constants. LABELS_LIST = [ @@ -29,13 +29,13 @@ LANGUAGE = "english" # Paths. -DATA_URL = "ftp://sys.giskard.ai/pub/unit_test_resources/medical_transcript_classification_dataset/mtsamples.csv" -DATA_PATH = Path.home() / ".giskard" / "medical_transcript_classification_dataset" / "mtsamples.csv" +DATA_URL = "https://giskard-library-test-datasets.s3.eu-north-1.amazonaws.com/medical_transcript_classification_dataset-mtsamples.csv.tar.gz" +DATA_PATH = Path.home() / ".giskard" / "medical_transcript_classification_dataset" / "mtsamples.csv.tar.gz" def load_data() -> pd.DataFrame: # Download dataset. - fetch_from_ftp(DATA_URL, DATA_PATH) + fetch_test_data(DATA_URL, DATA_PATH) df = pd.read_csv(DATA_PATH) # Drop useless columns. 
diff --git a/tests/fixtures/tripadvisor_text_classification_torch.py b/tests/fixtures/tripadvisor_text_classification_torch.py index 78002c551b..61977a5b48 100644 --- a/tests/fixtures/tripadvisor_text_classification_torch.py +++ b/tests/fixtures/tripadvisor_text_classification_torch.py @@ -13,12 +13,12 @@ from transformers import DistilBertForSequenceClassification, DistilBertTokenizer from giskard import Dataset, Model, models -from tests.url_utils import fetch_from_ftp +from tests.url_utils import fetch_test_data # Data -DATA_URL = "ftp://sys.giskard.ai/pub/unit_test_resources/tripadvisor_reviews_dataset/{}" +DATA_URL = "https://giskard-library-test-datasets.s3.eu-north-1.amazonaws.com/tripadvisor_reviews_dataset-{}" DATA_PATH = Path.home() / ".giskard" / "tripadvisor_reviews_dataset" -DATA_FILE_NAME = "tripadvisor_hotel_reviews.csv" +DATA_FILE_NAME = "tripadvisor_hotel_reviews.csv.tar.gz" # Constants PRETRAINED_WEIGHTS_NAME = "distilbert-base-uncased" @@ -115,7 +115,7 @@ def text_preprocessor(df: pd.DataFrame) -> pd.DataFrame: def load_dataset() -> pd.DataFrame: # Download dataset - fetch_from_ftp(DATA_URL.format(DATA_FILE_NAME), DATA_PATH / DATA_FILE_NAME) + fetch_test_data(DATA_URL.format(DATA_FILE_NAME), DATA_PATH / DATA_FILE_NAME) df = pd.read_csv(DATA_PATH / DATA_FILE_NAME, nrows=MAX_NUM_ROWS) # Obtain labels for our task. df[TARGET_COLUMN_NAME] = df.Rating.apply(lambda x: create_label(x)) diff --git a/tests/url_utils.py b/tests/url_utils.py index e198afd87d..074cb0dac6 100644 --- a/tests/url_utils.py +++ b/tests/url_utils.py @@ -2,7 +2,7 @@ from urllib.request import urlretrieve -def fetch_from_ftp(url: str, file: Path) -> None: +def fetch_test_data(url: str, file: Path) -> None: if not file.parent.exists(): file.parent.mkdir(parents=True, exist_ok=True)
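Note on the .tar.gz switch: the updated loaders pass the downloaded archives straight to pandas, which infers tar compression from a ".tar.gz" suffix only in pandas >= 1.5 and expects a single member per archive. Below is a minimal sketch (not part of the diff) of the download-and-read pattern the notebooks now rely on, reusing the drug-classification URL and path from this change; the version gate and the manual single-member extraction fallback for older pandas are assumptions, not code from the PR.

import tarfile
from pathlib import Path
from urllib.request import urlretrieve

import pandas as pd

# Constants taken from the drug-classification notebook in this diff.
DATA_URL = "https://giskard-library-test-datasets.s3.eu-north-1.amazonaws.com/drug_classification_dataset-drug200.csv.tar.gz"
DATA_PATH = Path.home() / ".giskard" / "drug_classification_dataset" / "drug200.csv.tar.gz"


def fetch_demo_data(url: str, file: Path) -> None:
    """Download the archive once and cache it under ~/.giskard."""
    if not file.parent.exists():
        file.parent.mkdir(parents=True, exist_ok=True)
    if not file.exists():
        urlretrieve(url, file)


def load_data() -> pd.DataFrame:
    fetch_demo_data(DATA_URL, DATA_PATH)
    major, minor = (int(part) for part in pd.__version__.split(".")[:2])
    if (major, minor) >= (1, 5):
        # pandas >= 1.5 infers tar compression from the ".tar.gz" suffix
        # and reads the lone CSV member directly.
        return pd.read_csv(DATA_PATH)
    # Hypothetical fallback for older pandas: pull out the single member
    # ourselves and hand the file object to the parser.
    with tarfile.open(DATA_PATH, "r:gz") as archive:
        member = archive.getmembers()[0]
        with archive.extractfile(member) as extracted:
            return pd.read_csv(extracted)

The same pattern applies to pd.read_json(..., lines=True) for the .jsonl.tar.gz archives; only the glove embeddings file needs the explicit tarfile handling shown in the fake_real_news notebook, since it is parsed line by line rather than through pandas.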