From 0748c2d4db0a09a27ee669d26a49b9bc6268eed3 Mon Sep 17 00:00:00 2001 From: ev Date: Mon, 17 Oct 2022 20:09:39 +0100 Subject: [PATCH 01/12] start adding connector --- .../source-wikipedia-pageviews/.dockerignore | 6 ++ .../source-wikipedia-pageviews/Dockerfile | 38 +++++++++ .../source-wikipedia-pageviews/README.md | 79 +++++++++++++++++++ .../source-wikipedia-pageviews/__init__.py | 3 + .../acceptance-test-config.yml | 30 +++++++ .../acceptance-test-docker.sh | 16 ++++ .../source-wikipedia-pageviews/build.gradle | 9 +++ .../integration_tests/__init__.py | 3 + .../integration_tests/abnormal_state.json | 5 ++ .../integration_tests/acceptance.py | 16 ++++ .../integration_tests/catalog.json | 39 +++++++++ .../integration_tests/configured_catalog.json | 22 ++++++ .../integration_tests/invalid_config.json | 3 + .../integration_tests/sample_config.json | 3 + .../integration_tests/sample_state.json | 5 ++ .../source-wikipedia-pageviews/main.py | 13 +++ .../requirements.txt | 2 + .../source-wikipedia-pageviews/setup.py | 29 +++++++ .../source_wikipedia_pageviews/__init__.py | 8 ++ .../schemas/TODO.md | 16 ++++ .../schemas/customers.json | 16 ++++ .../schemas/employees.json | 19 +++++ .../source_wikipedia_pageviews/source.py | 18 +++++ .../source_wikipedia_pageviews/spec.yaml | 74 +++++++++++++++++ .../wikipedia_pageviews.yaml | 63 +++++++++++++++ 25 files changed, 535 insertions(+) create mode 100755 airbyte-integrations/connectors/source-wikipedia-pageviews/.dockerignore create mode 100755 airbyte-integrations/connectors/source-wikipedia-pageviews/Dockerfile create mode 100755 airbyte-integrations/connectors/source-wikipedia-pageviews/README.md create mode 100755 airbyte-integrations/connectors/source-wikipedia-pageviews/__init__.py create mode 100755 airbyte-integrations/connectors/source-wikipedia-pageviews/acceptance-test-config.yml create mode 100755 airbyte-integrations/connectors/source-wikipedia-pageviews/acceptance-test-docker.sh create mode 100755 
airbyte-integrations/connectors/source-wikipedia-pageviews/build.gradle create mode 100755 airbyte-integrations/connectors/source-wikipedia-pageviews/integration_tests/__init__.py create mode 100755 airbyte-integrations/connectors/source-wikipedia-pageviews/integration_tests/abnormal_state.json create mode 100755 airbyte-integrations/connectors/source-wikipedia-pageviews/integration_tests/acceptance.py create mode 100755 airbyte-integrations/connectors/source-wikipedia-pageviews/integration_tests/catalog.json create mode 100755 airbyte-integrations/connectors/source-wikipedia-pageviews/integration_tests/configured_catalog.json create mode 100755 airbyte-integrations/connectors/source-wikipedia-pageviews/integration_tests/invalid_config.json create mode 100755 airbyte-integrations/connectors/source-wikipedia-pageviews/integration_tests/sample_config.json create mode 100755 airbyte-integrations/connectors/source-wikipedia-pageviews/integration_tests/sample_state.json create mode 100755 airbyte-integrations/connectors/source-wikipedia-pageviews/main.py create mode 100755 airbyte-integrations/connectors/source-wikipedia-pageviews/requirements.txt create mode 100755 airbyte-integrations/connectors/source-wikipedia-pageviews/setup.py create mode 100755 airbyte-integrations/connectors/source-wikipedia-pageviews/source_wikipedia_pageviews/__init__.py create mode 100755 airbyte-integrations/connectors/source-wikipedia-pageviews/source_wikipedia_pageviews/schemas/TODO.md create mode 100755 airbyte-integrations/connectors/source-wikipedia-pageviews/source_wikipedia_pageviews/schemas/customers.json create mode 100755 airbyte-integrations/connectors/source-wikipedia-pageviews/source_wikipedia_pageviews/schemas/employees.json create mode 100755 airbyte-integrations/connectors/source-wikipedia-pageviews/source_wikipedia_pageviews/source.py create mode 100755 airbyte-integrations/connectors/source-wikipedia-pageviews/source_wikipedia_pageviews/spec.yaml create mode 100755 
airbyte-integrations/connectors/source-wikipedia-pageviews/source_wikipedia_pageviews/wikipedia_pageviews.yaml diff --git a/airbyte-integrations/connectors/source-wikipedia-pageviews/.dockerignore b/airbyte-integrations/connectors/source-wikipedia-pageviews/.dockerignore new file mode 100755 index 0000000000000..ef12d74d21bcc --- /dev/null +++ b/airbyte-integrations/connectors/source-wikipedia-pageviews/.dockerignore @@ -0,0 +1,6 @@ +* +!Dockerfile +!main.py +!source_wikipedia_pageviews +!setup.py +!secrets diff --git a/airbyte-integrations/connectors/source-wikipedia-pageviews/Dockerfile b/airbyte-integrations/connectors/source-wikipedia-pageviews/Dockerfile new file mode 100755 index 0000000000000..1bc63f17e8b48 --- /dev/null +++ b/airbyte-integrations/connectors/source-wikipedia-pageviews/Dockerfile @@ -0,0 +1,38 @@ +FROM python:3.9.11-alpine3.15 as base + +# build and load all requirements +FROM base as builder +WORKDIR /airbyte/integration_code + +# upgrade pip to the latest version +RUN apk --no-cache upgrade \ + && pip install --upgrade pip \ + && apk --no-cache add tzdata build-base + + +COPY setup.py ./ +# install necessary packages to a temporary folder +RUN pip install --prefix=/install . + +# build a clean environment +FROM base +WORKDIR /airbyte/integration_code + +# copy all loaded and built libraries to a pure basic image +COPY --from=builder /install /usr/local +# add default timezone settings +COPY --from=builder /usr/share/zoneinfo/Etc/UTC /etc/localtime +RUN echo "Etc/UTC" > /etc/timezone + +# bash is installed for more convenient debugging. 
+RUN apk --no-cache add bash + +# copy payload code only +COPY main.py ./ +COPY source_wikipedia_pageviews ./source_wikipedia_pageviews + +ENV AIRBYTE_ENTRYPOINT "python /airbyte/integration_code/main.py" +ENTRYPOINT ["python", "/airbyte/integration_code/main.py"] + +LABEL io.airbyte.version=0.1.0 +LABEL io.airbyte.name=airbyte/source-wikipedia-pageviews diff --git a/airbyte-integrations/connectors/source-wikipedia-pageviews/README.md b/airbyte-integrations/connectors/source-wikipedia-pageviews/README.md new file mode 100755 index 0000000000000..03cbba6a4e21c --- /dev/null +++ b/airbyte-integrations/connectors/source-wikipedia-pageviews/README.md @@ -0,0 +1,79 @@ +# Wikipedia Pageviews Source + +This is the repository for the Wikipedia Pageviews configuration based source connector. +For information about how to use this connector within Airbyte, see [the documentation](https://docs.airbyte.io/integrations/sources/wikipedia-pageviews). + +## Local development + +#### Building via Gradle +You can also build the connector in Gradle. This is typically used in CI and not needed for your development workflow. + +To build using Gradle, from the Airbyte repository root, run: +``` +./gradlew :airbyte-integrations:connectors:source-wikipedia-pageviews:build +``` + +#### Create credentials +**If you are a community contributor**, follow the instructions in the [documentation](https://docs.airbyte.io/integrations/sources/wikipedia-pageviews) +to generate the necessary credentials. Then create a file `secrets/config.json` conforming to the `source_wikipedia_pageviews/spec.yaml` file. +Note that any directory named `secrets` is gitignored across the entire Airbyte repo, so there is no danger of accidentally checking in sensitive information. +See `integration_tests/sample_config.json` for a sample config file. 
+ +**If you are an Airbyte core member**, copy the credentials in Lastpass under the secret name `source wikipedia-pageviews test creds` +and place them into `secrets/config.json`. + +### Locally running the connector docker image + +#### Build +First, make sure you build the latest Docker image: +``` +docker build . -t airbyte/source-wikipedia-pageviews:dev +``` + +You can also build the connector image via Gradle: +``` +./gradlew :airbyte-integrations:connectors:source-wikipedia-pageviews:airbyteDocker +``` +When building via Gradle, the docker image name and tag, respectively, are the values of the `io.airbyte.name` and `io.airbyte.version` `LABEL`s in +the Dockerfile. + +#### Run +Then run any of the connector commands as follows: +``` +docker run --rm airbyte/source-wikipedia-pageviews:dev spec +docker run --rm -v $(pwd)/secrets:/secrets airbyte/source-wikipedia-pageviews:dev check --config /secrets/config.json +docker run --rm -v $(pwd)/secrets:/secrets airbyte/source-wikipedia-pageviews:dev discover --config /secrets/config.json +docker run --rm -v $(pwd)/secrets:/secrets -v $(pwd)/integration_tests:/integration_tests airbyte/source-wikipedia-pageviews:dev read --config /secrets/config.json --catalog /integration_tests/configured_catalog.json +``` +## Testing + +#### Acceptance Tests +Customize the `acceptance-test-config.yml` file to configure tests. See [Source Acceptance Tests](https://docs.airbyte.io/connector-development/testing-connectors/source-acceptance-tests-reference) for more information. +If your connector needs to create or destroy resources for use during acceptance tests, create fixtures for them and place them inside `integration_tests/acceptance.py`. + +To run your integration tests with Docker, run `./acceptance-test-docker.sh` from the connector directory. + +### Using gradle to run tests +All commands should be run from the Airbyte project root. 
+To run unit tests: +``` +./gradlew :airbyte-integrations:connectors:source-wikipedia-pageviews:unitTest +``` +To run acceptance and custom integration tests: +``` +./gradlew :airbyte-integrations:connectors:source-wikipedia-pageviews:integrationTest +``` + +## Dependency Management +All of your dependencies should go in `setup.py`, NOT `requirements.txt`. The requirements file is only used to connect internal Airbyte dependencies in the monorepo for local development. +We split dependencies between two groups, dependencies that are: +* required for your connector to work need to go to `MAIN_REQUIREMENTS` list. +* required for the testing need to go to `TEST_REQUIREMENTS` list + +### Publishing a new version of the connector +You've checked out the repo, implemented a million dollar feature, and you're ready to share your changes with the world. Now what? +1. Make sure your changes are passing unit and integration tests. +1. Bump the connector version in `Dockerfile` -- just increment the value of the `LABEL io.airbyte.version` appropriately (we use [SemVer](https://semver.org/)). +1. Create a Pull Request. +1. Pat yourself on the back for being an awesome contributor. +1. Someone from Airbyte will take a look at your PR and iterate with you to merge it into master. diff --git a/airbyte-integrations/connectors/source-wikipedia-pageviews/__init__.py b/airbyte-integrations/connectors/source-wikipedia-pageviews/__init__.py new file mode 100755 index 0000000000000..1100c1c58cf51 --- /dev/null +++ b/airbyte-integrations/connectors/source-wikipedia-pageviews/__init__.py @@ -0,0 +1,3 @@ +# +# Copyright (c) 2022 Airbyte, Inc., all rights reserved. 
+# diff --git a/airbyte-integrations/connectors/source-wikipedia-pageviews/acceptance-test-config.yml b/airbyte-integrations/connectors/source-wikipedia-pageviews/acceptance-test-config.yml new file mode 100755 index 0000000000000..2ddd976a412bc --- /dev/null +++ b/airbyte-integrations/connectors/source-wikipedia-pageviews/acceptance-test-config.yml @@ -0,0 +1,30 @@ +# See [Source Acceptance Tests](https://docs.airbyte.io/connector-development/testing-connectors/source-acceptance-tests-reference) +# for more information about how to configure these tests +connector_image: airbyte/source-wikipedia-pageviews:dev +tests: + spec: + - spec_path: "source_wikipedia_pageviews/spec.yaml" + connection: + - config_path: "secrets/config.json" + status: "succeed" + - config_path: "integration_tests/invalid_config.json" + status: "failed" + discovery: + - config_path: "secrets/config.json" + basic_read: + - config_path: "secrets/config.json" + configured_catalog_path: "integration_tests/configured_catalog.json" + empty_streams: [] + # TODO uncomment this block to specify that the tests should assert the connector outputs the records provided in the input file + # expect_records: + # path: "integration_tests/expected_records.txt" + # extra_fields: no + # exact_order: no + # extra_records: yes + incremental: # TODO if your connector does not implement incremental sync, remove this block + - config_path: "secrets/config.json" + configured_catalog_path: "integration_tests/configured_catalog.json" + future_state_path: "integration_tests/abnormal_state.json" + full_refresh: + - config_path: "secrets/config.json" + configured_catalog_path: "integration_tests/configured_catalog.json" diff --git a/airbyte-integrations/connectors/source-wikipedia-pageviews/acceptance-test-docker.sh b/airbyte-integrations/connectors/source-wikipedia-pageviews/acceptance-test-docker.sh new file mode 100755 index 0000000000000..c51577d10690c --- /dev/null +++ 
b/airbyte-integrations/connectors/source-wikipedia-pageviews/acceptance-test-docker.sh @@ -0,0 +1,16 @@ +#!/usr/bin/env sh + +# Build latest connector image +docker build . -t $(cat acceptance-test-config.yml | grep "connector_image" | head -n 1 | cut -d: -f2-) + +# Pull latest acctest image +docker pull airbyte/source-acceptance-test:latest + +# Run +docker run --rm -it \ + -v /var/run/docker.sock:/var/run/docker.sock \ + -v /tmp:/tmp \ + -v $(pwd):/test_input \ + airbyte/source-acceptance-test \ + --acceptance-test-config /test_input + diff --git a/airbyte-integrations/connectors/source-wikipedia-pageviews/build.gradle b/airbyte-integrations/connectors/source-wikipedia-pageviews/build.gradle new file mode 100755 index 0000000000000..2900db7ffacb8 --- /dev/null +++ b/airbyte-integrations/connectors/source-wikipedia-pageviews/build.gradle @@ -0,0 +1,9 @@ +plugins { + id 'airbyte-python' + id 'airbyte-docker' + id 'airbyte-source-acceptance-test' +} + +airbytePython { + moduleDirectory 'source_wikipedia_pageviews' +} diff --git a/airbyte-integrations/connectors/source-wikipedia-pageviews/integration_tests/__init__.py b/airbyte-integrations/connectors/source-wikipedia-pageviews/integration_tests/__init__.py new file mode 100755 index 0000000000000..1100c1c58cf51 --- /dev/null +++ b/airbyte-integrations/connectors/source-wikipedia-pageviews/integration_tests/__init__.py @@ -0,0 +1,3 @@ +# +# Copyright (c) 2022 Airbyte, Inc., all rights reserved. 
+# diff --git a/airbyte-integrations/connectors/source-wikipedia-pageviews/integration_tests/abnormal_state.json b/airbyte-integrations/connectors/source-wikipedia-pageviews/integration_tests/abnormal_state.json new file mode 100755 index 0000000000000..52b0f2c2118f4 --- /dev/null +++ b/airbyte-integrations/connectors/source-wikipedia-pageviews/integration_tests/abnormal_state.json @@ -0,0 +1,5 @@ +{ + "todo-stream-name": { + "todo-field-name": "todo-abnormal-value" + } +} diff --git a/airbyte-integrations/connectors/source-wikipedia-pageviews/integration_tests/acceptance.py b/airbyte-integrations/connectors/source-wikipedia-pageviews/integration_tests/acceptance.py new file mode 100755 index 0000000000000..1302b2f57e10e --- /dev/null +++ b/airbyte-integrations/connectors/source-wikipedia-pageviews/integration_tests/acceptance.py @@ -0,0 +1,16 @@ +# +# Copyright (c) 2022 Airbyte, Inc., all rights reserved. +# + + +import pytest + +pytest_plugins = ("source_acceptance_test.plugin",) + + +@pytest.fixture(scope="session", autouse=True) +def connector_setup(): + """This fixture is a placeholder for external resources that acceptance test might require.""" + # TODO: setup test dependencies if needed. 
otherwise remove the TODO comments + yield + # TODO: clean up test dependencies diff --git a/airbyte-integrations/connectors/source-wikipedia-pageviews/integration_tests/catalog.json b/airbyte-integrations/connectors/source-wikipedia-pageviews/integration_tests/catalog.json new file mode 100755 index 0000000000000..6799946a68514 --- /dev/null +++ b/airbyte-integrations/connectors/source-wikipedia-pageviews/integration_tests/catalog.json @@ -0,0 +1,39 @@ +{ + "streams": [ + { + "name": "TODO fix this file", + "supported_sync_modes": ["full_refresh", "incremental"], + "source_defined_cursor": true, + "default_cursor_field": "column1", + "json_schema": { + "$schema": "http://json-schema.org/draft-07/schema#", + "type": "object", + "properties": { + "column1": { + "type": "string" + }, + "column2": { + "type": "number" + } + } + } + }, + { + "name": "table1", + "supported_sync_modes": ["full_refresh", "incremental"], + "source_defined_cursor": false, + "json_schema": { + "$schema": "http://json-schema.org/draft-07/schema#", + "type": "object", + "properties": { + "column1": { + "type": "string" + }, + "column2": { + "type": "number" + } + } + } + } + ] +} diff --git a/airbyte-integrations/connectors/source-wikipedia-pageviews/integration_tests/configured_catalog.json b/airbyte-integrations/connectors/source-wikipedia-pageviews/integration_tests/configured_catalog.json new file mode 100755 index 0000000000000..36f0468db0d8f --- /dev/null +++ b/airbyte-integrations/connectors/source-wikipedia-pageviews/integration_tests/configured_catalog.json @@ -0,0 +1,22 @@ +{ + "streams": [ + { + "stream": { + "name": "customers", + "json_schema": {}, + "supported_sync_modes": ["full_refresh"] + }, + "sync_mode": "full_refresh", + "destination_sync_mode": "overwrite" + }, + { + "stream": { + "name": "employees", + "json_schema": {}, + "supported_sync_modes": ["full_refresh", "incremental"] + }, + "sync_mode": "incremental", + "destination_sync_mode": "append" + } + ] +} diff --git 
a/airbyte-integrations/connectors/source-wikipedia-pageviews/integration_tests/invalid_config.json b/airbyte-integrations/connectors/source-wikipedia-pageviews/integration_tests/invalid_config.json new file mode 100755 index 0000000000000..f3732995784f2 --- /dev/null +++ b/airbyte-integrations/connectors/source-wikipedia-pageviews/integration_tests/invalid_config.json @@ -0,0 +1,3 @@ +{ + "todo-wrong-field": "this should be an incomplete config file, used in standard tests" +} diff --git a/airbyte-integrations/connectors/source-wikipedia-pageviews/integration_tests/sample_config.json b/airbyte-integrations/connectors/source-wikipedia-pageviews/integration_tests/sample_config.json new file mode 100755 index 0000000000000..ecc4913b84c74 --- /dev/null +++ b/airbyte-integrations/connectors/source-wikipedia-pageviews/integration_tests/sample_config.json @@ -0,0 +1,3 @@ +{ + "fix-me": "TODO" +} diff --git a/airbyte-integrations/connectors/source-wikipedia-pageviews/integration_tests/sample_state.json b/airbyte-integrations/connectors/source-wikipedia-pageviews/integration_tests/sample_state.json new file mode 100755 index 0000000000000..3587e579822d0 --- /dev/null +++ b/airbyte-integrations/connectors/source-wikipedia-pageviews/integration_tests/sample_state.json @@ -0,0 +1,5 @@ +{ + "todo-stream-name": { + "todo-field-name": "value" + } +} diff --git a/airbyte-integrations/connectors/source-wikipedia-pageviews/main.py b/airbyte-integrations/connectors/source-wikipedia-pageviews/main.py new file mode 100755 index 0000000000000..6cd40d0b94cae --- /dev/null +++ b/airbyte-integrations/connectors/source-wikipedia-pageviews/main.py @@ -0,0 +1,13 @@ +# +# Copyright (c) 2022 Airbyte, Inc., all rights reserved. 
+# + + +import sys + +from airbyte_cdk.entrypoint import launch +from source_wikipedia_pageviews import SourceWikipediaPageviews + +if __name__ == "__main__": + source = SourceWikipediaPageviews() + launch(source, sys.argv[1:]) diff --git a/airbyte-integrations/connectors/source-wikipedia-pageviews/requirements.txt b/airbyte-integrations/connectors/source-wikipedia-pageviews/requirements.txt new file mode 100755 index 0000000000000..0411042aa0911 --- /dev/null +++ b/airbyte-integrations/connectors/source-wikipedia-pageviews/requirements.txt @@ -0,0 +1,2 @@ +-e ../../bases/source-acceptance-test +-e . diff --git a/airbyte-integrations/connectors/source-wikipedia-pageviews/setup.py b/airbyte-integrations/connectors/source-wikipedia-pageviews/setup.py new file mode 100755 index 0000000000000..0b7d663fc0336 --- /dev/null +++ b/airbyte-integrations/connectors/source-wikipedia-pageviews/setup.py @@ -0,0 +1,29 @@ +# +# Copyright (c) 2022 Airbyte, Inc., all rights reserved. +# + + +from setuptools import find_packages, setup + +MAIN_REQUIREMENTS = [ + "airbyte-cdk~=0.1", +] + +TEST_REQUIREMENTS = [ + "pytest~=6.1", + "pytest-mock~=3.6.1", + "source-acceptance-test", +] + +setup( + name="source_wikipedia_pageviews", + description="Source implementation for Wikipedia Pageviews.", + author="Airbyte", + author_email="contact@airbyte.io", + packages=find_packages(), + install_requires=MAIN_REQUIREMENTS, + package_data={"": ["*.json", "*.yaml", "schemas/*.json", "schemas/shared/*.json"]}, + extras_require={ + "tests": TEST_REQUIREMENTS, + }, +) diff --git a/airbyte-integrations/connectors/source-wikipedia-pageviews/source_wikipedia_pageviews/__init__.py b/airbyte-integrations/connectors/source-wikipedia-pageviews/source_wikipedia_pageviews/__init__.py new file mode 100755 index 0000000000000..d52fb1e5ee55b --- /dev/null +++ b/airbyte-integrations/connectors/source-wikipedia-pageviews/source_wikipedia_pageviews/__init__.py @@ -0,0 +1,8 @@ +# +# Copyright (c) 2022 Airbyte, Inc., 
all rights reserved. +# + + +from .source import SourceWikipediaPageviews + +__all__ = ["SourceWikipediaPageviews"] diff --git a/airbyte-integrations/connectors/source-wikipedia-pageviews/source_wikipedia_pageviews/schemas/TODO.md b/airbyte-integrations/connectors/source-wikipedia-pageviews/source_wikipedia_pageviews/schemas/TODO.md new file mode 100755 index 0000000000000..d63eaa1f57ee6 --- /dev/null +++ b/airbyte-integrations/connectors/source-wikipedia-pageviews/source_wikipedia_pageviews/schemas/TODO.md @@ -0,0 +1,16 @@ +# TODO: Define your stream schemas +Your connector must describe the schema of each stream it can output using [JSONSchema](https://json-schema.org). + +You can describe the schema of your streams using one `.json` file per stream. + +## Static schemas +From the `wikipedia_pageviews.yaml` configuration file, you read the `.json` files in the `schemas/` directory. You can refer to a schema in your configuration file using the `schema_loader` component's `file_path` field. For example: +``` +schema_loader: + type: JsonSchema + file_path: "./source_wikipedia_pageviews/schemas/customers.json" +``` +Every stream specified in the configuration file should have a corresponding `.json` schema file. + +Delete this file once you're done. Or don't. 
Up to you :) + diff --git a/airbyte-integrations/connectors/source-wikipedia-pageviews/source_wikipedia_pageviews/schemas/customers.json b/airbyte-integrations/connectors/source-wikipedia-pageviews/source_wikipedia_pageviews/schemas/customers.json new file mode 100755 index 0000000000000..9a4b134858363 --- /dev/null +++ b/airbyte-integrations/connectors/source-wikipedia-pageviews/source_wikipedia_pageviews/schemas/customers.json @@ -0,0 +1,16 @@ +{ + "$schema": "http://json-schema.org/draft-07/schema#", + "type": "object", + "properties": { + "id": { + "type": ["null", "string"] + }, + "name": { + "type": ["null", "string"] + }, + "signup_date": { + "type": ["null", "string"], + "format": "date-time" + } + } +} diff --git a/airbyte-integrations/connectors/source-wikipedia-pageviews/source_wikipedia_pageviews/schemas/employees.json b/airbyte-integrations/connectors/source-wikipedia-pageviews/source_wikipedia_pageviews/schemas/employees.json new file mode 100755 index 0000000000000..2fa01a0fa1ff9 --- /dev/null +++ b/airbyte-integrations/connectors/source-wikipedia-pageviews/source_wikipedia_pageviews/schemas/employees.json @@ -0,0 +1,19 @@ +{ + "$schema": "http://json-schema.org/draft-07/schema#", + "type": "object", + "properties": { + "id": { + "type": ["null", "string"] + }, + "name": { + "type": ["null", "string"] + }, + "years_of_service": { + "type": ["null", "integer"] + }, + "start_date": { + "type": ["null", "string"], + "format": "date-time" + } + } +} diff --git a/airbyte-integrations/connectors/source-wikipedia-pageviews/source_wikipedia_pageviews/source.py b/airbyte-integrations/connectors/source-wikipedia-pageviews/source_wikipedia_pageviews/source.py new file mode 100755 index 0000000000000..19ed7ee95fb34 --- /dev/null +++ b/airbyte-integrations/connectors/source-wikipedia-pageviews/source_wikipedia_pageviews/source.py @@ -0,0 +1,18 @@ +# +# Copyright (c) 2022 Airbyte, Inc., all rights reserved. 
+# + +from airbyte_cdk.sources.declarative.yaml_declarative_source import YamlDeclarativeSource + +""" +This file provides the necessary constructs to interpret a provided declarative YAML configuration file into +source connector. + +WARNING: Do not modify this file. +""" + + +# Declarative Source +class SourceWikipediaPageviews(YamlDeclarativeSource): + def __init__(self): + super().__init__(**{"path_to_yaml": "wikipedia_pageviews.yaml"}) diff --git a/airbyte-integrations/connectors/source-wikipedia-pageviews/source_wikipedia_pageviews/spec.yaml b/airbyte-integrations/connectors/source-wikipedia-pageviews/source_wikipedia_pageviews/spec.yaml new file mode 100755 index 0000000000000..485ae300c761e --- /dev/null +++ b/airbyte-integrations/connectors/source-wikipedia-pageviews/source_wikipedia_pageviews/spec.yaml @@ -0,0 +1,74 @@ +documentationUrl: https://docsurl.com +connectionSpecification: + $schema: http://json-schema.org/draft-07/schema# + title: Wikipedia Pageviews Spec + type: object + required: + - project + - access + - agent + - article + - start + - end + - year + - month + - day + - country + additionalProperties: true + properties: + # 'TODO: This schema defines the configuration required for the source. This usually involves metadata such as database and/or authentication information.': + project: + type: string + description: If you want to filter by project, use the domain of any Wikimedia project. + examples: + - en.wikipedia.org + - www.mediawiki.org + - commons.wikimedia.org + access: + type: string + description: If you want to filter by access method, use one of desktop, mobile-app or mobile-web. If you are interested in pageviews regardless of access method, use all-access. + examples: + - all-access + - desktop + - mobile-app + - mobile-web + agent: + type: string + description: If you want to filter by agent type, use one of user, automated or spider. If you are interested in pageviews regardless of agent type, use all-agents. 
+ examples: + - all-agents + - user + - spider + - automated + article: + type: string + description: The title of any article in the specified project. Any spaces should be replaced with underscores. It also should be URI-encoded, so that non-URI-safe characters like %, / or ? are accepted. + examples: + - Are_You_the_One%3F + # granularity: + # type: string + # description: The time unit for the response data. As of today, the only supported granularity for this endpoint is daily and monthly. + # examples: + # - daily + # - monthly + start: + type: string + description: The date of the first day to include, in YYYYMMDD or YYYYMMDDHH format. + end: + type: string + description: The date of the last day to include, in YYYYMMDD or YYYYMMDDHH format. + year: + type: string + description: Year in YYYY format. + month: + type: string + description: Month in MM format. + day: + type: string + description: Day in DD format. + country: + type: string + description: The ISO 3166-1 alpha-2 code of a country for which to retrieve top articles. 
+ examples: + - FR + - IN diff --git a/airbyte-integrations/connectors/source-wikipedia-pageviews/source_wikipedia_pageviews/wikipedia_pageviews.yaml b/airbyte-integrations/connectors/source-wikipedia-pageviews/source_wikipedia_pageviews/wikipedia_pageviews.yaml new file mode 100755 index 0000000000000..36fe3b2e82c32 --- /dev/null +++ b/airbyte-integrations/connectors/source-wikipedia-pageviews/source_wikipedia_pageviews/wikipedia_pageviews.yaml @@ -0,0 +1,63 @@ +version: "0.1.0" + +definitions: + selector: + extractor: + field_pointer: [ "items" ] + requester: + url_base: "https://wikimedia.org/api/rest_v1/metrics/pageviews" + http_method: "GET" + # stream_slicer: + # start_datetime: "{{config.start}}" + # end_datetime: "{{config.end}}" + # step: "1d" + # start_time_option: + # field_name: "start" + # inject_into: "path" + # end_time_option: + # field_name: "end" + # inject_into: "path" + per_article_requester: + $options: + $ref: "*ref(definitions.requester)" + path: "/per-article/{{config.project}}/{{config.access}}/{{config.agent}}/daily/{{config.start}}/{{config.end}}" + top_requester: + $options: + $ref: "*ref(definitions.requester)" + path: "/top/{{config.project}}/{{config.access}}/{{config.year}}/{{config.month}}/{{config.day}}" + + per_article_retriever: + record_selector: + $ref: "*ref(definitions.selector)" + paginator: + type: NoPagination + requester: + $ref: "*ref(definitions.per_article_requester)" + # stream_slicer: + # $ref: "*ref(definitions.stream_slicer)" + top_retriever: + record_selector: + $ref: "*ref(definitions.selector)" + paginator: + type: NoPagination + requester: + $ref: "*ref(definitions.top_requester)" + per_article_stream: + retriever: + $ref: "*ref(definitions.per_article_retriever)" + $options: + name: "per-article" + primary_key: "article" + top_stream: + retriever: + $ref: "*ref(definitions.top_retriever)" + $options: + name: "top" + primary_key: "project" + +streams: ["*ref(definitions.per_article_stream)"] + # - 
"*ref(definitions.per_article_stream)" + # - "*ref(definitions.top_stream)" + +check: + stream_names: ["per-article"] From bdd188cc9b07fc43a96a4d0336a0a6a36c481fed Mon Sep 17 00:00:00 2001 From: ev Date: Fri, 21 Oct 2022 15:18:37 +0100 Subject: [PATCH 02/12] fix check and stream slicer --- .../integration_tests/configured_catalog.json | 10 ++-- .../schemas/customers.json | 16 ------- .../schemas/employees.json | 19 -------- .../schemas/per-article.json | 36 ++++++++++++++ .../schemas/top.json | 48 +++++++++++++++++++ .../wikipedia_pageviews.yaml | 42 +++++++++------- 6 files changed, 114 insertions(+), 57 deletions(-) delete mode 100755 airbyte-integrations/connectors/source-wikipedia-pageviews/source_wikipedia_pageviews/schemas/customers.json delete mode 100755 airbyte-integrations/connectors/source-wikipedia-pageviews/source_wikipedia_pageviews/schemas/employees.json create mode 100644 airbyte-integrations/connectors/source-wikipedia-pageviews/source_wikipedia_pageviews/schemas/per-article.json create mode 100644 airbyte-integrations/connectors/source-wikipedia-pageviews/source_wikipedia_pageviews/schemas/top.json diff --git a/airbyte-integrations/connectors/source-wikipedia-pageviews/integration_tests/configured_catalog.json b/airbyte-integrations/connectors/source-wikipedia-pageviews/integration_tests/configured_catalog.json index 36f0468db0d8f..f97acaffc7d75 100755 --- a/airbyte-integrations/connectors/source-wikipedia-pageviews/integration_tests/configured_catalog.json +++ b/airbyte-integrations/connectors/source-wikipedia-pageviews/integration_tests/configured_catalog.json @@ -2,7 +2,7 @@ "streams": [ { "stream": { - "name": "customers", + "name": "per-article", "json_schema": {}, "supported_sync_modes": ["full_refresh"] }, @@ -11,12 +11,12 @@ }, { "stream": { - "name": "employees", + "name": "top", "json_schema": {}, - "supported_sync_modes": ["full_refresh", "incremental"] + "supported_sync_modes": ["full_refresh"] }, - "sync_mode": "incremental", - 
"destination_sync_mode": "append" + "sync_mode": "full_refresh", + "destination_sync_mode": "overwrite" } ] } diff --git a/airbyte-integrations/connectors/source-wikipedia-pageviews/source_wikipedia_pageviews/schemas/customers.json b/airbyte-integrations/connectors/source-wikipedia-pageviews/source_wikipedia_pageviews/schemas/customers.json deleted file mode 100755 index 9a4b134858363..0000000000000 --- a/airbyte-integrations/connectors/source-wikipedia-pageviews/source_wikipedia_pageviews/schemas/customers.json +++ /dev/null @@ -1,16 +0,0 @@ -{ - "$schema": "http://json-schema.org/draft-07/schema#", - "type": "object", - "properties": { - "id": { - "type": ["null", "string"] - }, - "name": { - "type": ["null", "string"] - }, - "signup_date": { - "type": ["null", "string"], - "format": "date-time" - } - } -} diff --git a/airbyte-integrations/connectors/source-wikipedia-pageviews/source_wikipedia_pageviews/schemas/employees.json b/airbyte-integrations/connectors/source-wikipedia-pageviews/source_wikipedia_pageviews/schemas/employees.json deleted file mode 100755 index 2fa01a0fa1ff9..0000000000000 --- a/airbyte-integrations/connectors/source-wikipedia-pageviews/source_wikipedia_pageviews/schemas/employees.json +++ /dev/null @@ -1,19 +0,0 @@ -{ - "$schema": "http://json-schema.org/draft-07/schema#", - "type": "object", - "properties": { - "id": { - "type": ["null", "string"] - }, - "name": { - "type": ["null", "string"] - }, - "years_of_service": { - "type": ["null", "integer"] - }, - "start_date": { - "type": ["null", "string"], - "format": "date-time" - } - } -} diff --git a/airbyte-integrations/connectors/source-wikipedia-pageviews/source_wikipedia_pageviews/schemas/per-article.json b/airbyte-integrations/connectors/source-wikipedia-pageviews/source_wikipedia_pageviews/schemas/per-article.json new file mode 100644 index 0000000000000..33981dc35d733 --- /dev/null +++ 
b/airbyte-integrations/connectors/source-wikipedia-pageviews/source_wikipedia_pageviews/schemas/per-article.json @@ -0,0 +1,36 @@ +{ + "type": "object", + "properties": { + "items": { + "type": "array", + "items": { + "type": "object", + "properties": { + "project": { + "type": "string" + }, + "access": { + "type": "string" + }, + "article": { + "type": "string" + }, + "agent": { + "type": "string" + }, + "granularity": { + "type": "string" + }, + "timestamp": { + "type": "string" + }, + "views": { + "type": "integer", + "format": "int64" + } + } + } + } + }, + "$schema": "http://json-schema.org/schema#" +} \ No newline at end of file diff --git a/airbyte-integrations/connectors/source-wikipedia-pageviews/source_wikipedia_pageviews/schemas/top.json b/airbyte-integrations/connectors/source-wikipedia-pageviews/source_wikipedia_pageviews/schemas/top.json new file mode 100644 index 0000000000000..bd4904907a611 --- /dev/null +++ b/airbyte-integrations/connectors/source-wikipedia-pageviews/source_wikipedia_pageviews/schemas/top.json @@ -0,0 +1,48 @@ +{ + "type": "object", + "properties": { + "items": { + "type": "array", + "items": { + "type": "object", + "properties": { + "project": { + "type": "string" + }, + "access": { + "type": "string" + }, + "year": { + "type": "string" + }, + "month": { + "type": "string" + }, + "day": { + "type": "string" + }, + "articles": { + "type": "array", + "items": { + "type": "object", + "properties": { + "rank": { + "type": "integer", + "format": "int32" + }, + "article": { + "type": "string" + }, + "views": { + "type": "integer", + "format": "int64" + } + } + } + } + } + } + } + }, + "$schema": "http://json-schema.org/schema#" +} \ No newline at end of file diff --git a/airbyte-integrations/connectors/source-wikipedia-pageviews/source_wikipedia_pageviews/wikipedia_pageviews.yaml b/airbyte-integrations/connectors/source-wikipedia-pageviews/source_wikipedia_pageviews/wikipedia_pageviews.yaml index 36fe3b2e82c32..2ca957c8c7bcd 100755 --- 
a/airbyte-integrations/connectors/source-wikipedia-pageviews/source_wikipedia_pageviews/wikipedia_pageviews.yaml +++ b/airbyte-integrations/connectors/source-wikipedia-pageviews/source_wikipedia_pageviews/wikipedia_pageviews.yaml @@ -7,20 +7,26 @@ definitions: requester: url_base: "https://wikimedia.org/api/rest_v1/metrics/pageviews" http_method: "GET" - # stream_slicer: - # start_datetime: "{{config.start}}" - # end_datetime: "{{config.end}}" - # step: "1d" - # start_time_option: - # field_name: "start" - # inject_into: "path" - # end_time_option: - # field_name: "end" - # inject_into: "path" + request_options_provider: + request_headers: + "User-Agent": "AirbyteWikipediaPageviewsConnector/1.0 (https://github.com/airbyte/airbyte)" + stream_slicer: + type: DatetimeStreamSlicer + start_datetime: "{{config.start}}" + end_datetime: "{{config.end}}" + step: "1d" + cursor_field: "timestamp" + datetime_format: "%Y%m%d" + # start_time_option: + # field_name: "start" + # inject_into: "path" + # end_time_option: + # field_name: "end" + # inject_into: "path" per_article_requester: $options: $ref: "*ref(definitions.requester)" - path: "/per-article/{{config.project}}/{{config.access}}/{{config.agent}}/daily/{{config.start}}/{{config.end}}" + path: "/per-article/{{config.project}}/{{config.access}}/{{config.agent}}/{{config.article}}/daily/{{config.start}}/{{config.end}}" top_requester: $options: $ref: "*ref(definitions.requester)" @@ -33,8 +39,8 @@ definitions: type: NoPagination requester: $ref: "*ref(definitions.per_article_requester)" - # stream_slicer: - # $ref: "*ref(definitions.stream_slicer)" + stream_slicer: + $ref: "*ref(definitions.stream_slicer)" top_retriever: record_selector: $ref: "*ref(definitions.selector)" @@ -55,9 +61,11 @@ definitions: name: "top" primary_key: "project" -streams: ["*ref(definitions.per_article_stream)"] - # - "*ref(definitions.per_article_stream)" - # - "*ref(definitions.top_stream)" +streams: + - "*ref(definitions.per_article_stream)" + - 
"*ref(definitions.top_stream)" check: - stream_names: ["per-article"] + stream_names: + - "per-article" + - "top" From 027951cf46fa767b350565fe3e3f08aa1e1d5b42 Mon Sep 17 00:00:00 2001 From: ev Date: Sun, 23 Oct 2022 20:50:43 +0100 Subject: [PATCH 03/12] pass some tests --- .../integration_tests/abnormal_state.json | 4 ++-- .../integration_tests/invalid_config.json | 11 ++++++++++- .../wikipedia_pageviews.yaml | 8 -------- 3 files changed, 12 insertions(+), 11 deletions(-) diff --git a/airbyte-integrations/connectors/source-wikipedia-pageviews/integration_tests/abnormal_state.json b/airbyte-integrations/connectors/source-wikipedia-pageviews/integration_tests/abnormal_state.json index 52b0f2c2118f4..43c016b9bfaa0 100755 --- a/airbyte-integrations/connectors/source-wikipedia-pageviews/integration_tests/abnormal_state.json +++ b/airbyte-integrations/connectors/source-wikipedia-pageviews/integration_tests/abnormal_state.json @@ -1,5 +1,5 @@ { - "todo-stream-name": { - "todo-field-name": "todo-abnormal-value" + "per-article": { + "timestamp": "2099022700" } } diff --git a/airbyte-integrations/connectors/source-wikipedia-pageviews/integration_tests/invalid_config.json b/airbyte-integrations/connectors/source-wikipedia-pageviews/integration_tests/invalid_config.json index f3732995784f2..d98541705c479 100755 --- a/airbyte-integrations/connectors/source-wikipedia-pageviews/integration_tests/invalid_config.json +++ b/airbyte-integrations/connectors/source-wikipedia-pageviews/integration_tests/invalid_config.json @@ -1,3 +1,12 @@ { - "todo-wrong-field": "this should be an incomplete config file, used in standard tests" + "project": "en.wikipedia.org", + "access": "all-access", + "agent": "all-agents", + "article": "Are_You_the_One%3F", + "start": "", + "end": "20220101", + "year": "2022", + "month": "13", + "day": "01", + "country": "EN" } diff --git a/airbyte-integrations/connectors/source-wikipedia-pageviews/source_wikipedia_pageviews/wikipedia_pageviews.yaml 
b/airbyte-integrations/connectors/source-wikipedia-pageviews/source_wikipedia_pageviews/wikipedia_pageviews.yaml index 2ca957c8c7bcd..b0226a4d8e160 100755 --- a/airbyte-integrations/connectors/source-wikipedia-pageviews/source_wikipedia_pageviews/wikipedia_pageviews.yaml +++ b/airbyte-integrations/connectors/source-wikipedia-pageviews/source_wikipedia_pageviews/wikipedia_pageviews.yaml @@ -17,12 +17,6 @@ definitions: step: "1d" cursor_field: "timestamp" datetime_format: "%Y%m%d" - # start_time_option: - # field_name: "start" - # inject_into: "path" - # end_time_option: - # field_name: "end" - # inject_into: "path" per_article_requester: $options: $ref: "*ref(definitions.requester)" @@ -53,13 +47,11 @@ definitions: $ref: "*ref(definitions.per_article_retriever)" $options: name: "per-article" - primary_key: "article" top_stream: retriever: $ref: "*ref(definitions.top_retriever)" $options: name: "top" - primary_key: "project" streams: - "*ref(definitions.per_article_stream)" From 8ec7ea4cc0d744090bbe85666c50a5a355f2d58b Mon Sep 17 00:00:00 2001 From: ev Date: Fri, 28 Oct 2022 10:16:11 +0100 Subject: [PATCH 04/12] fix: github url Co-authored-by: Marcos Marx --- .../source_wikipedia_pageviews/wikipedia_pageviews.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/airbyte-integrations/connectors/source-wikipedia-pageviews/source_wikipedia_pageviews/wikipedia_pageviews.yaml b/airbyte-integrations/connectors/source-wikipedia-pageviews/source_wikipedia_pageviews/wikipedia_pageviews.yaml index b0226a4d8e160..86c5b2fa48348 100755 --- a/airbyte-integrations/connectors/source-wikipedia-pageviews/source_wikipedia_pageviews/wikipedia_pageviews.yaml +++ b/airbyte-integrations/connectors/source-wikipedia-pageviews/source_wikipedia_pageviews/wikipedia_pageviews.yaml @@ -9,7 +9,7 @@ definitions: http_method: "GET" request_options_provider: request_headers: - "User-Agent": "AirbyteWikipediaPageviewsConnector/1.0 (https://github.com/airbyte/airbyte)" + "User-Agent": 
"AirbyteWikipediaPageviewsConnector/1.0 (https://github.com/airbytehq/airbyte)" stream_slicer: type: DatetimeStreamSlicer start_datetime: "{{config.start}}" From d27bbe82e997f4c425368e1b9e1d9ebf4874e2ac Mon Sep 17 00:00:00 2001 From: ev Date: Fri, 28 Oct 2022 19:48:39 +0100 Subject: [PATCH 05/12] add stream slicer for top and fix some tests --- .../acceptance-test-config.yml | 8 +-- .../integration_tests/invalid_config.json | 5 +- .../integration_tests/sample_config.json | 8 ++- .../integration_tests/sample_state.json | 4 +- .../schemas/per-article.json | 50 ++++++++--------- .../schemas/top.json | 54 ++++++++----------- .../source_wikipedia_pageviews/spec.yaml | 28 +++++----- .../wikipedia_pageviews.yaml | 22 ++++++-- 8 files changed, 92 insertions(+), 87 deletions(-) diff --git a/airbyte-integrations/connectors/source-wikipedia-pageviews/acceptance-test-config.yml b/airbyte-integrations/connectors/source-wikipedia-pageviews/acceptance-test-config.yml index 2ddd976a412bc..09ee4ecd34518 100755 --- a/airbyte-integrations/connectors/source-wikipedia-pageviews/acceptance-test-config.yml +++ b/airbyte-integrations/connectors/source-wikipedia-pageviews/acceptance-test-config.yml @@ -21,10 +21,10 @@ tests: # extra_fields: no # exact_order: no # extra_records: yes - incremental: # TODO if your connector does not implement incremental sync, remove this block - - config_path: "secrets/config.json" - configured_catalog_path: "integration_tests/configured_catalog.json" - future_state_path: "integration_tests/abnormal_state.json" + # incremental: # TODO if your connector does not implement incremental sync, remove this block + # - config_path: "secrets/config.json" + # configured_catalog_path: "integration_tests/configured_catalog.json" + # future_state_path: "integration_tests/abnormal_state.json" full_refresh: - config_path: "secrets/config.json" configured_catalog_path: "integration_tests/configured_catalog.json" diff --git 
a/airbyte-integrations/connectors/source-wikipedia-pageviews/integration_tests/invalid_config.json b/airbyte-integrations/connectors/source-wikipedia-pageviews/integration_tests/invalid_config.json index d98541705c479..7040094d9ba61 100755 --- a/airbyte-integrations/connectors/source-wikipedia-pageviews/integration_tests/invalid_config.json +++ b/airbyte-integrations/connectors/source-wikipedia-pageviews/integration_tests/invalid_config.json @@ -4,9 +4,6 @@ "agent": "all-agents", "article": "Are_You_the_One%3F", "start": "", - "end": "20220101", - "year": "2022", - "month": "13", - "day": "01", + "end": "202440101", "country": "EN" } diff --git a/airbyte-integrations/connectors/source-wikipedia-pageviews/integration_tests/sample_config.json b/airbyte-integrations/connectors/source-wikipedia-pageviews/integration_tests/sample_config.json index ecc4913b84c74..e5628b5061820 100755 --- a/airbyte-integrations/connectors/source-wikipedia-pageviews/integration_tests/sample_config.json +++ b/airbyte-integrations/connectors/source-wikipedia-pageviews/integration_tests/sample_config.json @@ -1,3 +1,9 @@ { - "fix-me": "TODO" + "project": "en.wikipedia.org", + "access": "all-access", + "agent": "all-agents", + "article": "Are_You_the_One%3F", + "start": "20200101", + "end": "20200201", + "country": "EN" } diff --git a/airbyte-integrations/connectors/source-wikipedia-pageviews/integration_tests/sample_state.json b/airbyte-integrations/connectors/source-wikipedia-pageviews/integration_tests/sample_state.json index 3587e579822d0..d0e7de37580cb 100755 --- a/airbyte-integrations/connectors/source-wikipedia-pageviews/integration_tests/sample_state.json +++ b/airbyte-integrations/connectors/source-wikipedia-pageviews/integration_tests/sample_state.json @@ -1,5 +1,5 @@ { - "todo-stream-name": { - "todo-field-name": "value" + "per-article": { + "timestamp": "20210505" } } diff --git 
a/airbyte-integrations/connectors/source-wikipedia-pageviews/source_wikipedia_pageviews/schemas/per-article.json b/airbyte-integrations/connectors/source-wikipedia-pageviews/source_wikipedia_pageviews/schemas/per-article.json index 33981dc35d733..6f1907a4713c6 100644 --- a/airbyte-integrations/connectors/source-wikipedia-pageviews/source_wikipedia_pageviews/schemas/per-article.json +++ b/airbyte-integrations/connectors/source-wikipedia-pageviews/source_wikipedia_pageviews/schemas/per-article.json @@ -1,35 +1,27 @@ { "type": "object", "properties": { - "items": { - "type": "array", - "items": { - "type": "object", - "properties": { - "project": { - "type": "string" - }, - "access": { - "type": "string" - }, - "article": { - "type": "string" - }, - "agent": { - "type": "string" - }, - "granularity": { - "type": "string" - }, - "timestamp": { - "type": "string" - }, - "views": { - "type": "integer", - "format": "int64" - } - } - } + "project": { + "type": "string" + }, + "access": { + "type": "string" + }, + "article": { + "type": "string" + }, + "agent": { + "type": "string" + }, + "granularity": { + "type": "string" + }, + "timestamp": { + "type": "string" + }, + "views": { + "type": "integer", + "format": "int64" } }, "$schema": "http://json-schema.org/schema#" diff --git a/airbyte-integrations/connectors/source-wikipedia-pageviews/source_wikipedia_pageviews/schemas/top.json b/airbyte-integrations/connectors/source-wikipedia-pageviews/source_wikipedia_pageviews/schemas/top.json index bd4904907a611..bb3641384605e 100644 --- a/airbyte-integrations/connectors/source-wikipedia-pageviews/source_wikipedia_pageviews/schemas/top.json +++ b/airbyte-integrations/connectors/source-wikipedia-pageviews/source_wikipedia_pageviews/schemas/top.json @@ -1,44 +1,36 @@ { "type": "object", "properties": { - "items": { + "project": { + "type": "string" + }, + "access": { + "type": "string" + }, + "year": { + "type": "string" + }, + "month": { + "type": "string" + }, + "day": { + 
"type": "string" + }, + "articles": { "type": "array", "items": { "type": "object", "properties": { - "project": { - "type": "string" - }, - "access": { - "type": "string" - }, - "year": { - "type": "string" - }, - "month": { - "type": "string" + "rank": { + "type": "integer", + "format": "int32" }, - "day": { + "article": { "type": "string" }, - "articles": { - "type": "array", - "items": { - "type": "object", - "properties": { - "rank": { - "type": "integer", - "format": "int32" - }, - "article": { - "type": "string" - }, - "views": { - "type": "integer", - "format": "int64" - } - } - } + "views": { + "type": "integer", + "format": "int64" } } } diff --git a/airbyte-integrations/connectors/source-wikipedia-pageviews/source_wikipedia_pageviews/spec.yaml b/airbyte-integrations/connectors/source-wikipedia-pageviews/source_wikipedia_pageviews/spec.yaml index 485ae300c761e..21bdd52ccbd4c 100755 --- a/airbyte-integrations/connectors/source-wikipedia-pageviews/source_wikipedia_pageviews/spec.yaml +++ b/airbyte-integrations/connectors/source-wikipedia-pageviews/source_wikipedia_pageviews/spec.yaml @@ -10,15 +10,13 @@ connectionSpecification: - article - start - end - - year - - month - - day - country additionalProperties: true properties: # 'TODO: This schema defines the configuration required for the source. This usually involves metadata such as database and/or authentication information.': project: type: string + title: Project description: If you want to filter by project, use the domain of any Wikimedia project. examples: - en.wikipedia.org @@ -26,6 +24,7 @@ connectionSpecification: - commons.wikimedia.org access: type: string + title: Access description: If you want to filter by access method, use one of desktop, mobile-app or mobile-web. If you are interested in pageviews regardless of access method, use all-access. 
examples: - all-access @@ -34,6 +33,7 @@ connectionSpecification: - mobile-web agent: type: string + title: Agent description: If you want to filter by agent type, use one of user, automated or spider. If you are interested in pageviews regardless of agent type, use all-agents. examples: - all-agents @@ -42,6 +42,7 @@ connectionSpecification: - automated article: type: string + title: Article description: The title of any article in the specified project. Any spaces should be replaced with underscores. It also should be URI-encoded, so that non-URI-safe characters like %, / or ? are accepted. examples: - Are_You_the_One%3F @@ -53,21 +54,24 @@ connectionSpecification: # - monthly start: type: string + title: Start description: The date of the first day to include, in YYYYMMDD or YYYYMMDDHH format. end: type: string + title: End description: The date of the last day to include, in YYYYMMDD or YYYYMMDDHH format. - year: - type: string - description: Year in YYYY format. - month: - type: string - description: Month in MM format. - day: - type: string - description: Day in DD format. + # year: + # type: string + # description: Year in YYYY format. + # month: + # type: string + # description: Month in MM format. + # day: + # type: string + # description: Day in DD format. country: type: string + title: Country description: The ISO 3166-1 alpha-2 code of a country for which to retrieve top articles. 
examples: - FR diff --git a/airbyte-integrations/connectors/source-wikipedia-pageviews/source_wikipedia_pageviews/wikipedia_pageviews.yaml b/airbyte-integrations/connectors/source-wikipedia-pageviews/source_wikipedia_pageviews/wikipedia_pageviews.yaml index 86c5b2fa48348..6a1988c739895 100755 --- a/airbyte-integrations/connectors/source-wikipedia-pageviews/source_wikipedia_pageviews/wikipedia_pageviews.yaml +++ b/airbyte-integrations/connectors/source-wikipedia-pageviews/source_wikipedia_pageviews/wikipedia_pageviews.yaml @@ -10,13 +10,25 @@ definitions: request_options_provider: request_headers: "User-Agent": "AirbyteWikipediaPageviewsConnector/1.0 (https://github.com/airbytehq/airbyte)" - stream_slicer: + top_stream_slicer: + type: DatetimeStreamSlicer + start_datetime: + datetime: "{{config.start}}" + datetime_format: "%Y%m%d" + end_datetime: + datetime: "{{config.start}}" + datetime_format: "%Y%m%d" + step: "1d" + cursor_field: "timestamp" + datetime_format: "%Y/%m/%d" + per_article_stream_slicer: type: DatetimeStreamSlicer start_datetime: "{{config.start}}" end_datetime: "{{config.end}}" step: "1d" cursor_field: "timestamp" datetime_format: "%Y%m%d" + per_article_requester: $options: $ref: "*ref(definitions.requester)" @@ -24,8 +36,7 @@ definitions: top_requester: $options: $ref: "*ref(definitions.requester)" - path: "/top/{{config.project}}/{{config.access}}/{{config.year}}/{{config.month}}/{{config.day}}" - + path: "/top/{{config.project}}/{{config.access}}/{{stream_slice.start_time}}" per_article_retriever: record_selector: $ref: "*ref(definitions.selector)" @@ -34,7 +45,8 @@ definitions: requester: $ref: "*ref(definitions.per_article_requester)" stream_slicer: - $ref: "*ref(definitions.stream_slicer)" + $ref: "*ref(definitions.per_article_stream_slicer)" + top_retriever: record_selector: $ref: "*ref(definitions.selector)" @@ -42,6 +54,8 @@ definitions: type: NoPagination requester: $ref: "*ref(definitions.top_requester)" + stream_slicer: + $ref: 
"*ref(definitions.top_stream_slicer)" per_article_stream: retriever: $ref: "*ref(definitions.per_article_retriever)" From 9f452d761ba0ff59e0d008ef4ed4ee24f24d5865 Mon Sep 17 00:00:00 2001 From: ev Date: Fri, 28 Oct 2022 20:01:49 +0100 Subject: [PATCH 06/12] remove unneeded files --- .../integration_tests/catalog.json | 39 ------------------- .../schemas/TODO.md | 16 -------- .../wikipedia_pageviews.yaml | 2 +- 3 files changed, 1 insertion(+), 56 deletions(-) delete mode 100755 airbyte-integrations/connectors/source-wikipedia-pageviews/integration_tests/catalog.json delete mode 100755 airbyte-integrations/connectors/source-wikipedia-pageviews/source_wikipedia_pageviews/schemas/TODO.md diff --git a/airbyte-integrations/connectors/source-wikipedia-pageviews/integration_tests/catalog.json b/airbyte-integrations/connectors/source-wikipedia-pageviews/integration_tests/catalog.json deleted file mode 100755 index 6799946a68514..0000000000000 --- a/airbyte-integrations/connectors/source-wikipedia-pageviews/integration_tests/catalog.json +++ /dev/null @@ -1,39 +0,0 @@ -{ - "streams": [ - { - "name": "TODO fix this file", - "supported_sync_modes": ["full_refresh", "incremental"], - "source_defined_cursor": true, - "default_cursor_field": "column1", - "json_schema": { - "$schema": "http://json-schema.org/draft-07/schema#", - "type": "object", - "properties": { - "column1": { - "type": "string" - }, - "column2": { - "type": "number" - } - } - } - }, - { - "name": "table1", - "supported_sync_modes": ["full_refresh", "incremental"], - "source_defined_cursor": false, - "json_schema": { - "$schema": "http://json-schema.org/draft-07/schema#", - "type": "object", - "properties": { - "column1": { - "type": "string" - }, - "column2": { - "type": "number" - } - } - } - } - ] -} diff --git a/airbyte-integrations/connectors/source-wikipedia-pageviews/source_wikipedia_pageviews/schemas/TODO.md 
b/airbyte-integrations/connectors/source-wikipedia-pageviews/source_wikipedia_pageviews/schemas/TODO.md deleted file mode 100755 index d63eaa1f57ee6..0000000000000 --- a/airbyte-integrations/connectors/source-wikipedia-pageviews/source_wikipedia_pageviews/schemas/TODO.md +++ /dev/null @@ -1,16 +0,0 @@ -# TODO: Define your stream schemas -Your connector must describe the schema of each stream it can output using [JSONSchema](https://json-schema.org). - -You can describe the schema of your streams using one `.json` file per stream. - -## Static schemas -From the `wikipedia_pageviews.yaml` configuration file, you read the `.json` files in the `schemas/` directory. You can refer to a schema in your configuration file using the `schema_loader` component's `file_path` field. For example: -``` -schema_loader: - type: JsonSchema - file_path: "./source_wikipedia_pageviews/schemas/customers.json" -``` -Every stream specified in the configuration file should have a corresponding `.json` schema file. - -Delete this file once you're done. Or don't. 
Up to you :) - diff --git a/airbyte-integrations/connectors/source-wikipedia-pageviews/source_wikipedia_pageviews/wikipedia_pageviews.yaml b/airbyte-integrations/connectors/source-wikipedia-pageviews/source_wikipedia_pageviews/wikipedia_pageviews.yaml index 6a1988c739895..2bdb8f8bd165c 100755 --- a/airbyte-integrations/connectors/source-wikipedia-pageviews/source_wikipedia_pageviews/wikipedia_pageviews.yaml +++ b/airbyte-integrations/connectors/source-wikipedia-pageviews/source_wikipedia_pageviews/wikipedia_pageviews.yaml @@ -32,7 +32,7 @@ definitions: per_article_requester: $options: $ref: "*ref(definitions.requester)" - path: "/per-article/{{config.project}}/{{config.access}}/{{config.agent}}/{{config.article}}/daily/{{config.start}}/{{config.end}}" + path: "/per-article/{{config.project}}/{{config.access}}/{{config.agent}}/{{config.article}}/daily/{{stream_slice.start_time}}/{{stream_slice.end_time}}" top_requester: $options: $ref: "*ref(definitions.requester)" From 5957a50c0751b4c25d6b61caf05240d4f7910d5f Mon Sep 17 00:00:00 2001 From: ev Date: Mon, 31 Oct 2022 14:28:57 +0000 Subject: [PATCH 07/12] Increment the version. 
--- .../connectors/source-wikipedia-pageviews/Dockerfile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/airbyte-integrations/connectors/source-wikipedia-pageviews/Dockerfile b/airbyte-integrations/connectors/source-wikipedia-pageviews/Dockerfile index 1bc63f17e8b48..d17962e0cca9f 100755 --- a/airbyte-integrations/connectors/source-wikipedia-pageviews/Dockerfile +++ b/airbyte-integrations/connectors/source-wikipedia-pageviews/Dockerfile @@ -34,5 +34,5 @@ COPY source_wikipedia_pageviews ./source_wikipedia_pageviews ENV AIRBYTE_ENTRYPOINT "python /airbyte/integration_code/main.py" ENTRYPOINT ["python", "/airbyte/integration_code/main.py"] -LABEL io.airbyte.version=0.1.0 +LABEL io.airbyte.version=1.0.0 LABEL io.airbyte.name=airbyte/source-wikipedia-pageviews From 368c0e6cc9aaaf61dfb6e323eb21cfa94964d2db Mon Sep 17 00:00:00 2001 From: Ev Date: Mon, 31 Oct 2022 23:37:23 +0000 Subject: [PATCH 08/12] add docs and clean up unneeded comments --- .../source-wikipedia-pageviews/Dockerfile | 2 +- .../source_wikipedia_pageviews/spec.yaml | 16 ------ .../sources/wikipedia-pageviews.md | 53 +++++++++++++++++++ 3 files changed, 54 insertions(+), 17 deletions(-) create mode 100644 docs/integrations/sources/wikipedia-pageviews.md diff --git a/airbyte-integrations/connectors/source-wikipedia-pageviews/Dockerfile b/airbyte-integrations/connectors/source-wikipedia-pageviews/Dockerfile index d17962e0cca9f..1bc63f17e8b48 100755 --- a/airbyte-integrations/connectors/source-wikipedia-pageviews/Dockerfile +++ b/airbyte-integrations/connectors/source-wikipedia-pageviews/Dockerfile @@ -34,5 +34,5 @@ COPY source_wikipedia_pageviews ./source_wikipedia_pageviews ENV AIRBYTE_ENTRYPOINT "python /airbyte/integration_code/main.py" ENTRYPOINT ["python", "/airbyte/integration_code/main.py"] -LABEL io.airbyte.version=1.0.0 +LABEL io.airbyte.version=0.1.0 LABEL io.airbyte.name=airbyte/source-wikipedia-pageviews diff --git 
a/airbyte-integrations/connectors/source-wikipedia-pageviews/source_wikipedia_pageviews/spec.yaml b/airbyte-integrations/connectors/source-wikipedia-pageviews/source_wikipedia_pageviews/spec.yaml index 21bdd52ccbd4c..32bba6f7a92d8 100755 --- a/airbyte-integrations/connectors/source-wikipedia-pageviews/source_wikipedia_pageviews/spec.yaml +++ b/airbyte-integrations/connectors/source-wikipedia-pageviews/source_wikipedia_pageviews/spec.yaml @@ -13,7 +13,6 @@ connectionSpecification: - country additionalProperties: true properties: - # 'TODO: This schema defines the configuration required for the source. This usually involves metadata such as database and/or authentication information.': project: type: string title: Project @@ -46,12 +45,6 @@ connectionSpecification: description: The title of any article in the specified project. Any spaces should be replaced with underscores. It also should be URI-encoded, so that non-URI-safe characters like %, / or ? are accepted. examples: - Are_You_the_One%3F - # granularity: - # type: string - # description: The time unit for the response data. As of today, the only supported granularity for this endpoint is daily and monthly. - # examples: - # - daily - # - monthly start: type: string title: Start @@ -60,15 +53,6 @@ connectionSpecification: type: string title: End description: The date of the last day to include, in YYYYMMDD or YYYYMMDDHH format. - # year: - # type: string - # description: Year in YYYY format. - # month: - # type: string - # description: Month in MM format. - # day: - # type: string - # description: Day in DD format. 
country: type: string title: Country diff --git a/docs/integrations/sources/wikipedia-pageviews.md b/docs/integrations/sources/wikipedia-pageviews.md new file mode 100644 index 0000000000000..249204b283340 --- /dev/null +++ b/docs/integrations/sources/wikipedia-pageviews.md @@ -0,0 +1,53 @@ +# Wikipedia Pageviews + +This page contains the setup guide and reference information for the [Wikipedia Pageviews](https://wikimedia.org/api/rest_v1/#/Pageviews%20data) source connector. + +## Prerequisites + +None + +## Setup guide + +## Step 1: Set up the Wikipedia Pageviews connector in Airbyte + +### For Airbyte Cloud: + +1. [Log into your Airbyte Cloud](https://cloud.airbyte.io/workspaces) account. +2. In the left navigation bar, click **Sources**. In the top-right corner, click **+new source**. +3. On the Set up the source page, enter the name for the Wikipedia Pageviews connector and select **Wikipedia Pageviews** from the Source type dropdown. +4. Enter your parameters. +5. Click **Set up source**. + +### For Airbyte OSS: + +1. Navigate to the Airbyte Open Source dashboard. +2. Set the name for your source. +3. Enter your parameters. +4. Click **Set up source**. + +## Supported sync modes + +The Wikipedia Pageviews source connector supports the following [sync modes](https://docs.airbyte.com/cloud/core-concepts#connection-sync-modes): + +| Feature | Supported? | +| :---------------------------- | :--------- | +| Full Refresh Sync | Yes | +| Incremental Sync | No | +| Replicate Incremental Deletes | No | +| SSL connection | Yes | +| Namespaces | No | + +## Supported Streams + +- per-article +- top + +## Performance considerations + +100 req/s per endpoint. 
+ +## Changelog + +| Version | Date | Pull Request | Subject | +| :------ | :--------- | :----------------------------------------------------- | :------------- | +| 0.1.0 | 2022-10-31 | [#18343](https://github.com/airbytehq/airbyte/pull/18343) | Initial commit | \ No newline at end of file From fae473deb56e668c1befc2a31aca524eb09ff897 Mon Sep 17 00:00:00 2001 From: ev Date: Thu, 3 Nov 2022 17:25:57 +0000 Subject: [PATCH 09/12] Bump airbyte-cdk version to 0.2 --- .../connectors/source-wikipedia-pageviews/setup.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/airbyte-integrations/connectors/source-wikipedia-pageviews/setup.py b/airbyte-integrations/connectors/source-wikipedia-pageviews/setup.py index 0b7d663fc0336..b49e03eabfeba 100755 --- a/airbyte-integrations/connectors/source-wikipedia-pageviews/setup.py +++ b/airbyte-integrations/connectors/source-wikipedia-pageviews/setup.py @@ -6,7 +6,7 @@ from setuptools import find_packages, setup MAIN_REQUIREMENTS = [ - "airbyte-cdk~=0.1", + "airbyte-cdk~=0.2", ] TEST_REQUIREMENTS = [ From 0ede1f589da8610014f9a26241a94a86f92d98d7 Mon Sep 17 00:00:00 2001 From: marcosmarxm Date: Thu, 3 Nov 2022 16:42:19 -0300 Subject: [PATCH 10/12] add schemaloader --- .../source_wikipedia_pageviews/wikipedia_pageviews.yaml | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/airbyte-integrations/connectors/source-wikipedia-pageviews/source_wikipedia_pageviews/wikipedia_pageviews.yaml b/airbyte-integrations/connectors/source-wikipedia-pageviews/source_wikipedia_pageviews/wikipedia_pageviews.yaml index 2bdb8f8bd165c..03a81cb06bf7e 100755 --- a/airbyte-integrations/connectors/source-wikipedia-pageviews/source_wikipedia_pageviews/wikipedia_pageviews.yaml +++ b/airbyte-integrations/connectors/source-wikipedia-pageviews/source_wikipedia_pageviews/wikipedia_pageviews.yaml @@ -57,11 +57,17 @@ definitions: stream_slicer: $ref: "*ref(definitions.top_stream_slicer)" per_article_stream: + schema_loader: + type: JsonSchema + 
file_path: "./source_wikipedia_pageviews/schemas/{{ options['name'] }}.json" retriever: $ref: "*ref(definitions.per_article_retriever)" $options: name: "per-article" top_stream: + schema_loader: + type: JsonSchema + file_path: "./source_wikipedia_pageviews/schemas/{{ options['name'] }}.json" retriever: $ref: "*ref(definitions.top_retriever)" $options: From 7e875bfc6dad3bb23f9ca0a86fa43433b5d32dd7 Mon Sep 17 00:00:00 2001 From: marcosmarxm Date: Thu, 3 Nov 2022 16:44:54 -0300 Subject: [PATCH 11/12] add wikipedia to source def --- .../init/src/main/resources/seed/source_definitions.yaml | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/airbyte-config/init/src/main/resources/seed/source_definitions.yaml b/airbyte-config/init/src/main/resources/seed/source_definitions.yaml index 90c091333e32a..80dbf6455484a 100644 --- a/airbyte-config/init/src/main/resources/seed/source_definitions.yaml +++ b/airbyte-config/init/src/main/resources/seed/source_definitions.yaml @@ -1477,6 +1477,13 @@ documentationUrl: https://docs.airbyte.com/integrations/sources/waiteraid sourceType: api releaseStage: alpha +- name: Wikipedia Pageviews + sourceDefinitionId: 87c58f70-6f7a-4f70-aba5-bab1a458f5ba + dockerRepository: airbyte/source-wikipedia-pageviews + dockerImageTag: 0.1.0 + documentationUrl: https://docs.airbyte.com/integrations/sources/wikipedia-pageviews + sourceType: api + releaseStage: alpha - name: Yandex Metrica sourceDefinitionId: 7865dce4-2211-4f6a-88e5-9d0fe161afe7 dockerRepository: airbyte/source-yandex-metrica From c06465506cf8491fd73fab701c13be23062e11a5 Mon Sep 17 00:00:00 2001 From: Octavia Squidington III Date: Thu, 3 Nov 2022 20:05:08 +0000 Subject: [PATCH 12/12] auto-bump connector version --- .../src/main/resources/seed/source_specs.yaml | 77 +++++++++++++++++++ 1 file changed, 77 insertions(+) diff --git a/airbyte-config/init/src/main/resources/seed/source_specs.yaml b/airbyte-config/init/src/main/resources/seed/source_specs.yaml index 
82a4f6b866289..b98e95ac8dbcc 100644 --- a/airbyte-config/init/src/main/resources/seed/source_specs.yaml +++ b/airbyte-config/init/src/main/resources/seed/source_specs.yaml @@ -14163,6 +14163,83 @@ supportsNormalization: false supportsDBT: false supported_destination_sync_modes: [] +- dockerImage: "airbyte/source-wikipedia-pageviews:0.1.0" + spec: + documentationUrl: "https://docsurl.com" + connectionSpecification: + $schema: "http://json-schema.org/draft-07/schema#" + title: "Wikipedia Pageviews Spec" + type: "object" + required: + - "project" + - "access" + - "agent" + - "article" + - "start" + - "end" + - "country" + additionalProperties: true + properties: + project: + type: "string" + title: "Project" + description: "If you want to filter by project, use the domain of any Wikimedia\ + \ project." + examples: + - "en.wikipedia.org" + - "www.mediawiki.org" + - "commons.wikimedia.org" + access: + type: "string" + title: "Access" + description: "If you want to filter by access method, use one of desktop,\ + \ mobile-app or mobile-web. If you are interested in pageviews regardless\ + \ of access method, use all-access." + examples: + - "all-access" + - "desktop" + - "mobile-app" + - "mobile-web" + agent: + type: "string" + title: "Agent" + description: "If you want to filter by agent type, use one of user, automated\ + \ or spider. If you are interested in pageviews regardless of agent type,\ + \ use all-agents." + examples: + - "all-agents" + - "user" + - "spider" + - "automated" + article: + type: "string" + title: "Article" + description: "The title of any article in the specified project. Any spaces\ + \ should be replaced with underscores. It also should be URI-encoded,\ + \ so that non-URI-safe characters like %, / or ? are accepted." + examples: + - "Are_You_the_One%3F" + start: + type: "string" + title: "Start" + description: "The date of the first day to include, in YYYYMMDD or YYYYMMDDHH\ + \ format." 
+ end: + type: "string" + title: "End" + description: "The date of the last day to include, in YYYYMMDD or YYYYMMDDHH\ + \ format." + country: + type: "string" + title: "Country" + description: "The ISO 3166-1 alpha-2 code of a country for which to retrieve\ + \ top articles." + examples: + - "FR" + - "IN" + supportsNormalization: false + supportsDBT: false + supported_destination_sync_modes: [] - dockerImage: "airbyte/source-yandex-metrica:0.1.0" spec: documentationUrl: "https://docsurl.com"