diff --git a/airbyte-config/init/src/main/resources/seed/source_definitions.yaml b/airbyte-config/init/src/main/resources/seed/source_definitions.yaml index 90c091333e32a..80dbf6455484a 100644 --- a/airbyte-config/init/src/main/resources/seed/source_definitions.yaml +++ b/airbyte-config/init/src/main/resources/seed/source_definitions.yaml @@ -1477,6 +1477,13 @@ documentationUrl: https://docs.airbyte.com/integrations/sources/waiteraid sourceType: api releaseStage: alpha +- name: Wikipedia Pageviews + sourceDefinitionId: 87c58f70-6f7a-4f70-aba5-bab1a458f5ba + dockerRepository: airbyte/source-wikipedia-pageviews + dockerImageTag: 0.1.0 + documentationUrl: https://docs.airbyte.com/integrations/sources/wikipedia-pageviews + sourceType: api + releaseStage: alpha - name: Yandex Metrica sourceDefinitionId: 7865dce4-2211-4f6a-88e5-9d0fe161afe7 dockerRepository: airbyte/source-yandex-metrica diff --git a/airbyte-config/init/src/main/resources/seed/source_specs.yaml b/airbyte-config/init/src/main/resources/seed/source_specs.yaml index 82a4f6b866289..b98e95ac8dbcc 100644 --- a/airbyte-config/init/src/main/resources/seed/source_specs.yaml +++ b/airbyte-config/init/src/main/resources/seed/source_specs.yaml @@ -14163,6 +14163,83 @@ supportsNormalization: false supportsDBT: false supported_destination_sync_modes: [] +- dockerImage: "airbyte/source-wikipedia-pageviews:0.1.0" + spec: + documentationUrl: "https://docsurl.com" + connectionSpecification: + $schema: "http://json-schema.org/draft-07/schema#" + title: "Wikipedia Pageviews Spec" + type: "object" + required: + - "project" + - "access" + - "agent" + - "article" + - "start" + - "end" + - "country" + additionalProperties: true + properties: + project: + type: "string" + title: "Project" + description: "If you want to filter by project, use the domain of any Wikimedia\ + \ project." 
+ examples: + - "en.wikipedia.org" + - "www.mediawiki.org" + - "commons.wikimedia.org" + access: + type: "string" + title: "Access" + description: "If you want to filter by access method, use one of desktop,\ + \ mobile-app or mobile-web. If you are interested in pageviews regardless\ + \ of access method, use all-access." + examples: + - "all-access" + - "desktop" + - "mobile-app" + - "mobile-web" + agent: + type: "string" + title: "Agent" + description: "If you want to filter by agent type, use one of user, automated\ + \ or spider. If you are interested in pageviews regardless of agent type,\ + \ use all-agents." + examples: + - "all-agents" + - "user" + - "spider" + - "automated" + article: + type: "string" + title: "Article" + description: "The title of any article in the specified project. Any spaces\ + \ should be replaced with underscores. It also should be URI-encoded,\ + \ so that non-URI-safe characters like %, / or ? are accepted." + examples: + - "Are_You_the_One%3F" + start: + type: "string" + title: "Start" + description: "The date of the first day to include, in YYYYMMDD or YYYYMMDDHH\ + \ format." + end: + type: "string" + title: "End" + description: "The date of the last day to include, in YYYYMMDD or YYYYMMDDHH\ + \ format." + country: + type: "string" + title: "Country" + description: "The ISO 3166-1 alpha-2 code of a country for which to retrieve\ + \ top articles." 
+ examples: + - "FR" + - "IN" + supportsNormalization: false + supportsDBT: false + supported_destination_sync_modes: [] - dockerImage: "airbyte/source-yandex-metrica:0.1.0" spec: documentationUrl: "https://docsurl.com" diff --git a/airbyte-integrations/connectors/source-wikipedia-pageviews/.dockerignore b/airbyte-integrations/connectors/source-wikipedia-pageviews/.dockerignore new file mode 100755 index 0000000000000..ef12d74d21bcc --- /dev/null +++ b/airbyte-integrations/connectors/source-wikipedia-pageviews/.dockerignore @@ -0,0 +1,6 @@ +* +!Dockerfile +!main.py +!source_wikipedia_pageviews +!setup.py +!secrets diff --git a/airbyte-integrations/connectors/source-wikipedia-pageviews/Dockerfile b/airbyte-integrations/connectors/source-wikipedia-pageviews/Dockerfile new file mode 100755 index 0000000000000..1bc63f17e8b48 --- /dev/null +++ b/airbyte-integrations/connectors/source-wikipedia-pageviews/Dockerfile @@ -0,0 +1,38 @@ +FROM python:3.9.11-alpine3.15 as base + +# build and load all requirements +FROM base as builder +WORKDIR /airbyte/integration_code + +# upgrade pip to the latest version +RUN apk --no-cache upgrade \ + && pip install --upgrade pip \ + && apk --no-cache add tzdata build-base + + +COPY setup.py ./ +# install necessary packages to a temporary folder +RUN pip install --prefix=/install . + +# build a clean environment +FROM base +WORKDIR /airbyte/integration_code + +# copy all loaded and built libraries to a pure basic image +COPY --from=builder /install /usr/local +# add default timezone settings +COPY --from=builder /usr/share/zoneinfo/Etc/UTC /etc/localtime +RUN echo "Etc/UTC" > /etc/timezone + +# bash is installed for more convenient debugging. 
+RUN apk --no-cache add bash + +# copy payload code only +COPY main.py ./ +COPY source_wikipedia_pageviews ./source_wikipedia_pageviews + +ENV AIRBYTE_ENTRYPOINT "python /airbyte/integration_code/main.py" +ENTRYPOINT ["python", "/airbyte/integration_code/main.py"] + +LABEL io.airbyte.version=0.1.0 +LABEL io.airbyte.name=airbyte/source-wikipedia-pageviews diff --git a/airbyte-integrations/connectors/source-wikipedia-pageviews/README.md b/airbyte-integrations/connectors/source-wikipedia-pageviews/README.md new file mode 100755 index 0000000000000..03cbba6a4e21c --- /dev/null +++ b/airbyte-integrations/connectors/source-wikipedia-pageviews/README.md @@ -0,0 +1,79 @@ +# Wikipedia Pageviews Source + +This is the repository for the Wikipedia Pageviews configuration based source connector. +For information about how to use this connector within Airbyte, see [the documentation](https://docs.airbyte.io/integrations/sources/wikipedia-pageviews). + +## Local development + +#### Building via Gradle +You can also build the connector in Gradle. This is typically used in CI and not needed for your development workflow. + +To build using Gradle, from the Airbyte repository root, run: +``` +./gradlew :airbyte-integrations:connectors:source-wikipedia-pageviews:build +``` + +#### Create credentials +**If you are a community contributor**, follow the instructions in the [documentation](https://docs.airbyte.io/integrations/sources/wikipedia-pageviews) +to generate the necessary credentials. Then create a file `secrets/config.json` conforming to the `source_wikipedia_pageviews/spec.yaml` file. +Note that any directory named `secrets` is gitignored across the entire Airbyte repo, so there is no danger of accidentally checking in sensitive information. +See `integration_tests/sample_config.json` for a sample config file. 
+ +**If you are an Airbyte core member**, copy the credentials in Lastpass under the secret name `source wikipedia-pageviews test creds` +and place them into `secrets/config.json`. + +### Locally running the connector docker image + +#### Build +First, make sure you build the latest Docker image: +``` +docker build . -t airbyte/source-wikipedia-pageviews:dev +``` + +You can also build the connector image via Gradle: +``` +./gradlew :airbyte-integrations:connectors:source-wikipedia-pageviews:airbyteDocker +``` +When building via Gradle, the docker image name and tag, respectively, are the values of the `io.airbyte.name` and `io.airbyte.version` `LABEL`s in +the Dockerfile. + +#### Run +Then run any of the connector commands as follows: +``` +docker run --rm airbyte/source-wikipedia-pageviews:dev spec +docker run --rm -v $(pwd)/secrets:/secrets airbyte/source-wikipedia-pageviews:dev check --config /secrets/config.json +docker run --rm -v $(pwd)/secrets:/secrets airbyte/source-wikipedia-pageviews:dev discover --config /secrets/config.json +docker run --rm -v $(pwd)/secrets:/secrets -v $(pwd)/integration_tests:/integration_tests airbyte/source-wikipedia-pageviews:dev read --config /secrets/config.json --catalog /integration_tests/configured_catalog.json +``` +## Testing + +#### Acceptance Tests +Customize the `acceptance-test-config.yml` file to configure tests. See [Source Acceptance Tests](https://docs.airbyte.io/connector-development/testing-connectors/source-acceptance-tests-reference) for more information. +If your connector needs to create or destroy resources for use during acceptance tests, create fixtures for them and place them inside `integration_tests/acceptance.py`. + +To run your integration tests with Docker, run `./acceptance-test-docker.sh` from the connector directory. + +### Using gradle to run tests +All commands should be run from the Airbyte project root.
+To run unit tests: +``` +./gradlew :airbyte-integrations:connectors:source-wikipedia-pageviews:unitTest +``` +To run acceptance and custom integration tests: +``` +./gradlew :airbyte-integrations:connectors:source-wikipedia-pageviews:integrationTest +``` + +## Dependency Management +All of your dependencies should go in `setup.py`, NOT `requirements.txt`. The requirements file is only used to connect internal Airbyte dependencies in the monorepo for local development. +We split dependencies between two groups, dependencies that are: +* required for your connector to work need to go to `MAIN_REQUIREMENTS` list. +* required for the testing need to go to `TEST_REQUIREMENTS` list + +### Publishing a new version of the connector +You've checked out the repo, implemented a million dollar feature, and you're ready to share your changes with the world. Now what? +1. Make sure your changes are passing unit and integration tests. +1. Bump the connector version in `Dockerfile` -- just increment the value of the `LABEL io.airbyte.version` appropriately (we use [SemVer](https://semver.org/)). +1. Create a Pull Request. +1. Pat yourself on the back for being an awesome contributor. +1. Someone from Airbyte will take a look at your PR and iterate with you to merge it into master. diff --git a/airbyte-integrations/connectors/source-wikipedia-pageviews/__init__.py b/airbyte-integrations/connectors/source-wikipedia-pageviews/__init__.py new file mode 100755 index 0000000000000..1100c1c58cf51 --- /dev/null +++ b/airbyte-integrations/connectors/source-wikipedia-pageviews/__init__.py @@ -0,0 +1,3 @@ +# +# Copyright (c) 2022 Airbyte, Inc., all rights reserved. 
+# diff --git a/airbyte-integrations/connectors/source-wikipedia-pageviews/acceptance-test-config.yml b/airbyte-integrations/connectors/source-wikipedia-pageviews/acceptance-test-config.yml new file mode 100755 index 0000000000000..09ee4ecd34518 --- /dev/null +++ b/airbyte-integrations/connectors/source-wikipedia-pageviews/acceptance-test-config.yml @@ -0,0 +1,30 @@ +# See [Source Acceptance Tests](https://docs.airbyte.io/connector-development/testing-connectors/source-acceptance-tests-reference) +# for more information about how to configure these tests +connector_image: airbyte/source-wikipedia-pageviews:dev +tests: + spec: + - spec_path: "source_wikipedia_pageviews/spec.yaml" + connection: + - config_path: "secrets/config.json" + status: "succeed" + - config_path: "integration_tests/invalid_config.json" + status: "failed" + discovery: + - config_path: "secrets/config.json" + basic_read: + - config_path: "secrets/config.json" + configured_catalog_path: "integration_tests/configured_catalog.json" + empty_streams: [] + # TODO uncomment this block to specify that the tests should assert the connector outputs the records provided in the input file a file + # expect_records: + # path: "integration_tests/expected_records.txt" + # extra_fields: no + # exact_order: no + # extra_records: yes + # incremental: # TODO if your connector does not implement incremental sync, remove this block + # - config_path: "secrets/config.json" + # configured_catalog_path: "integration_tests/configured_catalog.json" + # future_state_path: "integration_tests/abnormal_state.json" + full_refresh: + - config_path: "secrets/config.json" + configured_catalog_path: "integration_tests/configured_catalog.json" diff --git a/airbyte-integrations/connectors/source-wikipedia-pageviews/acceptance-test-docker.sh b/airbyte-integrations/connectors/source-wikipedia-pageviews/acceptance-test-docker.sh new file mode 100755 index 0000000000000..c51577d10690c --- /dev/null +++ 
b/airbyte-integrations/connectors/source-wikipedia-pageviews/acceptance-test-docker.sh @@ -0,0 +1,16 @@ +#!/usr/bin/env sh + +# Build latest connector image +docker build . -t $(cat acceptance-test-config.yml | grep "connector_image" | head -n 1 | cut -d: -f2-) + +# Pull latest acctest image +docker pull airbyte/source-acceptance-test:latest + +# Run +docker run --rm -it \ + -v /var/run/docker.sock:/var/run/docker.sock \ + -v /tmp:/tmp \ + -v $(pwd):/test_input \ + airbyte/source-acceptance-test \ + --acceptance-test-config /test_input + diff --git a/airbyte-integrations/connectors/source-wikipedia-pageviews/build.gradle b/airbyte-integrations/connectors/source-wikipedia-pageviews/build.gradle new file mode 100755 index 0000000000000..2900db7ffacb8 --- /dev/null +++ b/airbyte-integrations/connectors/source-wikipedia-pageviews/build.gradle @@ -0,0 +1,9 @@ +plugins { + id 'airbyte-python' + id 'airbyte-docker' + id 'airbyte-source-acceptance-test' +} + +airbytePython { + moduleDirectory 'source_wikipedia_pageviews' +} diff --git a/airbyte-integrations/connectors/source-wikipedia-pageviews/integration_tests/__init__.py b/airbyte-integrations/connectors/source-wikipedia-pageviews/integration_tests/__init__.py new file mode 100755 index 0000000000000..1100c1c58cf51 --- /dev/null +++ b/airbyte-integrations/connectors/source-wikipedia-pageviews/integration_tests/__init__.py @@ -0,0 +1,3 @@ +# +# Copyright (c) 2022 Airbyte, Inc., all rights reserved. 
+# diff --git a/airbyte-integrations/connectors/source-wikipedia-pageviews/integration_tests/abnormal_state.json b/airbyte-integrations/connectors/source-wikipedia-pageviews/integration_tests/abnormal_state.json new file mode 100755 index 0000000000000..43c016b9bfaa0 --- /dev/null +++ b/airbyte-integrations/connectors/source-wikipedia-pageviews/integration_tests/abnormal_state.json @@ -0,0 +1,5 @@ +{ + "per-article": { + "timestamp": "2099022700" + } +} diff --git a/airbyte-integrations/connectors/source-wikipedia-pageviews/integration_tests/acceptance.py b/airbyte-integrations/connectors/source-wikipedia-pageviews/integration_tests/acceptance.py new file mode 100755 index 0000000000000..1302b2f57e10e --- /dev/null +++ b/airbyte-integrations/connectors/source-wikipedia-pageviews/integration_tests/acceptance.py @@ -0,0 +1,16 @@ +# +# Copyright (c) 2022 Airbyte, Inc., all rights reserved. +# + + +import pytest + +pytest_plugins = ("source_acceptance_test.plugin",) + + +@pytest.fixture(scope="session", autouse=True) +def connector_setup(): + """This fixture is a placeholder for external resources that acceptance test might require.""" + # TODO: setup test dependencies if needed. 
otherwise remove the TODO comments + yield + # TODO: clean up test dependencies diff --git a/airbyte-integrations/connectors/source-wikipedia-pageviews/integration_tests/configured_catalog.json b/airbyte-integrations/connectors/source-wikipedia-pageviews/integration_tests/configured_catalog.json new file mode 100755 index 0000000000000..f97acaffc7d75 --- /dev/null +++ b/airbyte-integrations/connectors/source-wikipedia-pageviews/integration_tests/configured_catalog.json @@ -0,0 +1,22 @@ +{ + "streams": [ + { + "stream": { + "name": "per-article", + "json_schema": {}, + "supported_sync_modes": ["full_refresh"] + }, + "sync_mode": "full_refresh", + "destination_sync_mode": "overwrite" + }, + { + "stream": { + "name": "top", + "json_schema": {}, + "supported_sync_modes": ["full_refresh"] + }, + "sync_mode": "full_refresh", + "destination_sync_mode": "overwrite" + } + ] +} diff --git a/airbyte-integrations/connectors/source-wikipedia-pageviews/integration_tests/invalid_config.json b/airbyte-integrations/connectors/source-wikipedia-pageviews/integration_tests/invalid_config.json new file mode 100755 index 0000000000000..7040094d9ba61 --- /dev/null +++ b/airbyte-integrations/connectors/source-wikipedia-pageviews/integration_tests/invalid_config.json @@ -0,0 +1,9 @@ +{ + "project": "en.wikipedia.org", + "access": "all-access", + "agent": "all-agents", + "article": "Are_You_the_One%3F", + "start": "", + "end": "202440101", + "country": "EN" +} diff --git a/airbyte-integrations/connectors/source-wikipedia-pageviews/integration_tests/sample_config.json b/airbyte-integrations/connectors/source-wikipedia-pageviews/integration_tests/sample_config.json new file mode 100755 index 0000000000000..e5628b5061820 --- /dev/null +++ b/airbyte-integrations/connectors/source-wikipedia-pageviews/integration_tests/sample_config.json @@ -0,0 +1,9 @@ +{ + "project": "en.wikipedia.org", + "access": "all-access", + "agent": "all-agents", + "article": "Are_You_the_One%3F", + "start": "20200101", 
+ "end": "20200201", + "country": "EN" +} diff --git a/airbyte-integrations/connectors/source-wikipedia-pageviews/integration_tests/sample_state.json b/airbyte-integrations/connectors/source-wikipedia-pageviews/integration_tests/sample_state.json new file mode 100755 index 0000000000000..d0e7de37580cb --- /dev/null +++ b/airbyte-integrations/connectors/source-wikipedia-pageviews/integration_tests/sample_state.json @@ -0,0 +1,5 @@ +{ + "per-article": { + "timestamp": "20210505" + } +} diff --git a/airbyte-integrations/connectors/source-wikipedia-pageviews/main.py b/airbyte-integrations/connectors/source-wikipedia-pageviews/main.py new file mode 100755 index 0000000000000..6cd40d0b94cae --- /dev/null +++ b/airbyte-integrations/connectors/source-wikipedia-pageviews/main.py @@ -0,0 +1,13 @@ +# +# Copyright (c) 2022 Airbyte, Inc., all rights reserved. +# + + +import sys + +from airbyte_cdk.entrypoint import launch +from source_wikipedia_pageviews import SourceWikipediaPageviews + +if __name__ == "__main__": + source = SourceWikipediaPageviews() + launch(source, sys.argv[1:]) diff --git a/airbyte-integrations/connectors/source-wikipedia-pageviews/requirements.txt b/airbyte-integrations/connectors/source-wikipedia-pageviews/requirements.txt new file mode 100755 index 0000000000000..0411042aa0911 --- /dev/null +++ b/airbyte-integrations/connectors/source-wikipedia-pageviews/requirements.txt @@ -0,0 +1,2 @@ +-e ../../bases/source-acceptance-test +-e . diff --git a/airbyte-integrations/connectors/source-wikipedia-pageviews/setup.py b/airbyte-integrations/connectors/source-wikipedia-pageviews/setup.py new file mode 100755 index 0000000000000..b49e03eabfeba --- /dev/null +++ b/airbyte-integrations/connectors/source-wikipedia-pageviews/setup.py @@ -0,0 +1,29 @@ +# +# Copyright (c) 2022 Airbyte, Inc., all rights reserved. 
+# + + +from setuptools import find_packages, setup + +MAIN_REQUIREMENTS = [ + "airbyte-cdk~=0.2", +] + +TEST_REQUIREMENTS = [ + "pytest~=6.1", + "pytest-mock~=3.6.1", + "source-acceptance-test", +] + +setup( + name="source_wikipedia_pageviews", + description="Source implementation for Wikipedia Pageviews.", + author="Airbyte", + author_email="contact@airbyte.io", + packages=find_packages(), + install_requires=MAIN_REQUIREMENTS, + package_data={"": ["*.json", "*.yaml", "schemas/*.json", "schemas/shared/*.json"]}, + extras_require={ + "tests": TEST_REQUIREMENTS, + }, +) diff --git a/airbyte-integrations/connectors/source-wikipedia-pageviews/source_wikipedia_pageviews/__init__.py b/airbyte-integrations/connectors/source-wikipedia-pageviews/source_wikipedia_pageviews/__init__.py new file mode 100755 index 0000000000000..d52fb1e5ee55b --- /dev/null +++ b/airbyte-integrations/connectors/source-wikipedia-pageviews/source_wikipedia_pageviews/__init__.py @@ -0,0 +1,8 @@ +# +# Copyright (c) 2022 Airbyte, Inc., all rights reserved. 
+# + + +from .source import SourceWikipediaPageviews + +__all__ = ["SourceWikipediaPageviews"] diff --git a/airbyte-integrations/connectors/source-wikipedia-pageviews/source_wikipedia_pageviews/schemas/per-article.json b/airbyte-integrations/connectors/source-wikipedia-pageviews/source_wikipedia_pageviews/schemas/per-article.json new file mode 100644 index 0000000000000..6f1907a4713c6 --- /dev/null +++ b/airbyte-integrations/connectors/source-wikipedia-pageviews/source_wikipedia_pageviews/schemas/per-article.json @@ -0,0 +1,28 @@ +{ + "type": "object", + "properties": { + "project": { + "type": "string" + }, + "access": { + "type": "string" + }, + "article": { + "type": "string" + }, + "agent": { + "type": "string" + }, + "granularity": { + "type": "string" + }, + "timestamp": { + "type": "string" + }, + "views": { + "type": "integer", + "format": "int64" + } + }, + "$schema": "http://json-schema.org/schema#" +} \ No newline at end of file diff --git a/airbyte-integrations/connectors/source-wikipedia-pageviews/source_wikipedia_pageviews/schemas/top.json b/airbyte-integrations/connectors/source-wikipedia-pageviews/source_wikipedia_pageviews/schemas/top.json new file mode 100644 index 0000000000000..bb3641384605e --- /dev/null +++ b/airbyte-integrations/connectors/source-wikipedia-pageviews/source_wikipedia_pageviews/schemas/top.json @@ -0,0 +1,40 @@ +{ + "type": "object", + "properties": { + "project": { + "type": "string" + }, + "access": { + "type": "string" + }, + "year": { + "type": "string" + }, + "month": { + "type": "string" + }, + "day": { + "type": "string" + }, + "articles": { + "type": "array", + "items": { + "type": "object", + "properties": { + "rank": { + "type": "integer", + "format": "int32" + }, + "article": { + "type": "string" + }, + "views": { + "type": "integer", + "format": "int64" + } + } + } + } + }, + "$schema": "http://json-schema.org/schema#" +} \ No newline at end of file diff --git 
a/airbyte-integrations/connectors/source-wikipedia-pageviews/source_wikipedia_pageviews/source.py b/airbyte-integrations/connectors/source-wikipedia-pageviews/source_wikipedia_pageviews/source.py new file mode 100755 index 0000000000000..19ed7ee95fb34 --- /dev/null +++ b/airbyte-integrations/connectors/source-wikipedia-pageviews/source_wikipedia_pageviews/source.py @@ -0,0 +1,18 @@ +# +# Copyright (c) 2022 Airbyte, Inc., all rights reserved. +# + +from airbyte_cdk.sources.declarative.yaml_declarative_source import YamlDeclarativeSource + +""" +This file provides the necessary constructs to interpret a provided declarative YAML configuration file into +source connector. + +WARNING: Do not modify this file. +""" + + +# Declarative Source +class SourceWikipediaPageviews(YamlDeclarativeSource): + def __init__(self): + super().__init__(**{"path_to_yaml": "wikipedia_pageviews.yaml"}) diff --git a/airbyte-integrations/connectors/source-wikipedia-pageviews/source_wikipedia_pageviews/spec.yaml b/airbyte-integrations/connectors/source-wikipedia-pageviews/source_wikipedia_pageviews/spec.yaml new file mode 100755 index 0000000000000..32bba6f7a92d8 --- /dev/null +++ b/airbyte-integrations/connectors/source-wikipedia-pageviews/source_wikipedia_pageviews/spec.yaml @@ -0,0 +1,62 @@ +documentationUrl: https://docs.airbyte.com/integrations/sources/wikipedia-pageviews +connectionSpecification: + $schema: http://json-schema.org/draft-07/schema# + title: Wikipedia Pageviews Spec + type: object + required: + - project + - access + - agent + - article + - start + - end + - country + additionalProperties: true + properties: + project: + type: string + title: Project + description: If you want to filter by project, use the domain of any Wikimedia project. + examples: + - en.wikipedia.org + - www.mediawiki.org + - commons.wikimedia.org + access: + type: string + title: Access + description: If you want to filter by access method, use one of desktop, mobile-app or mobile-web.
If you are interested in pageviews regardless of access method, use all-access. + examples: + - all-access + - desktop + - mobile-app + - mobile-web + agent: + type: string + title: Agent + description: If you want to filter by agent type, use one of user, automated or spider. If you are interested in pageviews regardless of agent type, use all-agents. + examples: + - all-agents + - user + - spider + - automated + article: + type: string + title: Article + description: The title of any article in the specified project. Any spaces should be replaced with underscores. It also should be URI-encoded, so that non-URI-safe characters like %, / or ? are accepted. + examples: + - Are_You_the_One%3F + start: + type: string + title: Start + description: The date of the first day to include, in YYYYMMDD or YYYYMMDDHH format. + end: + type: string + title: End + description: The date of the last day to include, in YYYYMMDD or YYYYMMDDHH format. + country: + type: string + title: Country + description: The ISO 3166-1 alpha-2 code of a country for which to retrieve top articles. 
+ examples: + - FR + - IN diff --git a/airbyte-integrations/connectors/source-wikipedia-pageviews/source_wikipedia_pageviews/wikipedia_pageviews.yaml b/airbyte-integrations/connectors/source-wikipedia-pageviews/source_wikipedia_pageviews/wikipedia_pageviews.yaml new file mode 100755 index 0000000000000..03a81cb06bf7e --- /dev/null +++ b/airbyte-integrations/connectors/source-wikipedia-pageviews/source_wikipedia_pageviews/wikipedia_pageviews.yaml @@ -0,0 +1,83 @@ +version: "0.1.0" + +definitions: + selector: + extractor: + field_pointer: [ "items" ] + requester: + url_base: "https://wikimedia.org/api/rest_v1/metrics/pageviews" + http_method: "GET" + request_options_provider: + request_headers: + "User-Agent": "AirbyteWikipediaPageviewsConnector/1.0 (https://github.com/airbytehq/airbyte)" + top_stream_slicer: + type: DatetimeStreamSlicer + start_datetime: + datetime: "{{config.start}}" + datetime_format: "%Y%m%d" + end_datetime: + datetime: "{{config.end}}" + datetime_format: "%Y%m%d" + step: "1d" + cursor_field: "timestamp" + datetime_format: "%Y/%m/%d" + per_article_stream_slicer: + type: DatetimeStreamSlicer + start_datetime: "{{config.start}}" + end_datetime: "{{config.end}}" + step: "1d" + cursor_field: "timestamp" + datetime_format: "%Y%m%d" + + per_article_requester: + $options: + $ref: "*ref(definitions.requester)" + path: "/per-article/{{config.project}}/{{config.access}}/{{config.agent}}/{{config.article}}/daily/{{stream_slice.start_time}}/{{stream_slice.end_time}}" + top_requester: + $options: + $ref: "*ref(definitions.requester)" + path: "/top/{{config.project}}/{{config.access}}/{{stream_slice.start_time}}" + per_article_retriever: + record_selector: + $ref: "*ref(definitions.selector)" + paginator: + type: NoPagination + requester: + $ref: "*ref(definitions.per_article_requester)" + stream_slicer: + $ref: "*ref(definitions.per_article_stream_slicer)" + + top_retriever: + record_selector: + $ref: "*ref(definitions.selector)" + paginator: + type:
NoPagination + requester: + $ref: "*ref(definitions.top_requester)" + stream_slicer: + $ref: "*ref(definitions.top_stream_slicer)" + per_article_stream: + schema_loader: + type: JsonSchema + file_path: "./source_wikipedia_pageviews/schemas/{{ options['name'] }}.json" + retriever: + $ref: "*ref(definitions.per_article_retriever)" + $options: + name: "per-article" + top_stream: + schema_loader: + type: JsonSchema + file_path: "./source_wikipedia_pageviews/schemas/{{ options['name'] }}.json" + retriever: + $ref: "*ref(definitions.top_retriever)" + $options: + name: "top" + +streams: + - "*ref(definitions.per_article_stream)" + - "*ref(definitions.top_stream)" + +check: + stream_names: + - "per-article" + - "top" diff --git a/docs/integrations/sources/wikipedia-pageviews.md b/docs/integrations/sources/wikipedia-pageviews.md new file mode 100644 index 0000000000000..249204b283340 --- /dev/null +++ b/docs/integrations/sources/wikipedia-pageviews.md @@ -0,0 +1,53 @@ +# Wikipedia Pageviews + +This page contains the setup guide and reference information for the [Wikipedia Pageviews](https://wikimedia.org/api/rest_v1/#/Pageviews%20data) source connector. + +## Prerequisites + +None + +## Setup guide + +## Step 1: Set up the Wikipedia Pageviews connector in Airbyte + +### For Airbyte Cloud: + +1. [Log into your Airbyte Cloud](https://cloud.airbyte.io/workspaces) account. +2. In the left navigation bar, click **Sources**. In the top-right corner, click **+new source**. +3. On the Set up the source page, enter the name for the Wikipedia Pageviews connector and select **Wikipedia Pageviews** from the Source type dropdown. +4. Enter your parameters. +5. Click **Set up source**. + +### For Airbyte OSS: + +1. Navigate to the Airbyte Open Source dashboard. +2. Set the name for your source. +3. Enter your parameters. +4. Click **Set up source**.
+ +## Supported sync modes + +The Wikipedia Pageviews source connector supports the following [sync modes](https://docs.airbyte.com/cloud/core-concepts#connection-sync-modes): + +| Feature | Supported? | +| :---------------------------- | :--------- | +| Full Refresh Sync | Yes | +| Incremental Sync | No | +| Replicate Incremental Deletes | No | +| SSL connection | Yes | +| Namespaces | No | + +## Supported Streams + +- per-article +- top + +## Performance considerations + +100 req/s per endpoint. + +## Changelog + +| Version | Date | Pull Request | Subject | +| :------ | :--------- | :----------------------------------------------------- | :------------- | +| 0.1.0 | 2022-10-31 | [#18343](https://github.com/airbytehq/airbyte/pull/18343) | Initial commit | \ No newline at end of file