Skip to content

Commit f7bef97

Browse files
evwltrsmarcosmarxmoctavia-squidington-iii
authored
New Source: Wikipedia Pageviews (#18343)
* start adding connector * fix check and stream slicer * pass some tests * fix: github url Co-authored-by: Marcos Marx <[email protected]> * add stream slicer for top and fix some tests * remove unneeded files * Increment the version. * add docs and clean up unneeded comments * Bump airbyte-cdk version to 0.2 * add schemaloader * add wikipedia to source def * auto-bump connector version Co-authored-by: Marcos Marx <[email protected]> Co-authored-by: marcosmarxm <[email protected]> Co-authored-by: Octavia Squidington III <[email protected]>
1 parent 5b0ed4f commit f7bef97

File tree

26 files changed

+670
-0
lines changed

26 files changed

+670
-0
lines changed

airbyte-config/init/src/main/resources/seed/source_definitions.yaml

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1484,6 +1484,13 @@
14841484
documentationUrl: https://docs.airbyte.com/integrations/sources/waiteraid
14851485
sourceType: api
14861486
releaseStage: alpha
1487+
- name: Wikipedia Pageviews
1488+
sourceDefinitionId: 87c58f70-6f7a-4f70-aba5-bab1a458f5ba
1489+
dockerRepository: airbyte/source-wikipedia-pageviews
1490+
dockerImageTag: 0.1.0
1491+
documentationUrl: https://docs.airbyte.com/integrations/sources/wikipedia-pageviews
1492+
sourceType: api
1493+
releaseStage: alpha
14871494
- name: Yandex Metrica
14881495
sourceDefinitionId: 7865dce4-2211-4f6a-88e5-9d0fe161afe7
14891496
dockerRepository: airbyte/source-yandex-metrica

airbyte-config/init/src/main/resources/seed/source_specs.yaml

Lines changed: 77 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -14185,6 +14185,83 @@
1418514185
supportsNormalization: false
1418614186
supportsDBT: false
1418714187
supported_destination_sync_modes: []
14188+
- dockerImage: "airbyte/source-wikipedia-pageviews:0.1.0"
14189+
spec:
14190+
documentationUrl: "https://docsurl.com"
14191+
connectionSpecification:
14192+
$schema: "http://json-schema.org/draft-07/schema#"
14193+
title: "Wikipedia Pageviews Spec"
14194+
type: "object"
14195+
required:
14196+
- "project"
14197+
- "access"
14198+
- "agent"
14199+
- "article"
14200+
- "start"
14201+
- "end"
14202+
- "country"
14203+
additionalProperties: true
14204+
properties:
14205+
project:
14206+
type: "string"
14207+
title: "Project"
14208+
description: "If you want to filter by project, use the domain of any Wikimedia\
14209+
\ project."
14210+
examples:
14211+
- "en.wikipedia.org"
14212+
- "www.mediawiki.org"
14213+
- "commons.wikimedia.org"
14214+
access:
14215+
type: "string"
14216+
title: "Access"
14217+
description: "If you want to filter by access method, use one of desktop,\
14218+
\ mobile-app or mobile-web. If you are interested in pageviews regardless\
14219+
\ of access method, use all-access."
14220+
examples:
14221+
- "all-access"
14222+
- "desktop"
14223+
- "mobile-app"
14224+
- "mobile-web"
14225+
agent:
14226+
type: "string"
14227+
title: "Agent"
14228+
description: "If you want to filter by agent type, use one of user, automated\
14229+
\ or spider. If you are interested in pageviews regardless of agent type,\
14230+
\ use all-agents."
14231+
examples:
14232+
- "all-agents"
14233+
- "user"
14234+
- "spider"
14235+
- "automated"
14236+
article:
14237+
type: "string"
14238+
title: "Article"
14239+
description: "The title of any article in the specified project. Any spaces\
14240+
\ should be replaced with underscores. It also should be URI-encoded,\
14241+
\ so that non-URI-safe characters like %, / or ? are accepted."
14242+
examples:
14243+
- "Are_You_the_One%3F"
14244+
start:
14245+
type: "string"
14246+
title: "Start"
14247+
description: "The date of the first day to include, in YYYYMMDD or YYYYMMDDHH\
14248+
\ format."
14249+
end:
14250+
type: "string"
14251+
title: "End"
14252+
description: "The date of the last day to include, in YYYYMMDD or YYYYMMDDHH\
14253+
\ format."
14254+
country:
14255+
type: "string"
14256+
title: "Country"
14257+
description: "The ISO 3166-1 alpha-2 code of a country for which to retrieve\
14258+
\ top articles."
14259+
examples:
14260+
- "FR"
14261+
- "IN"
14262+
supportsNormalization: false
14263+
supportsDBT: false
14264+
supported_destination_sync_modes: []
1418814265
- dockerImage: "airbyte/source-yandex-metrica:0.1.0"
1418914266
spec:
1419014267
documentationUrl: "https://docsurl.com"
Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,6 @@
1+
*
2+
!Dockerfile
3+
!main.py
4+
!source_wikipedia_pageviews
5+
!setup.py
6+
!secrets
Lines changed: 38 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,38 @@
1+
# shared base image: both the builder stage and the final runtime stage start from it
FROM python:3.9.11-alpine3.15 AS base
2+
3+
# build and load all requirements
4+
FROM base AS builder
5+
WORKDIR /airbyte/integration_code
6+
7+
# upgrade pip to the latest version
8+
RUN apk --no-cache upgrade \
9+
&& pip install --upgrade pip \
10+
&& apk --no-cache add tzdata build-base
11+
12+
13+
COPY setup.py ./
14+
# install necessary packages to a temporary folder
15+
RUN pip install --no-cache-dir --prefix=/install .
16+
17+
# build a clean environment
18+
FROM base
19+
WORKDIR /airbyte/integration_code
20+
21+
# copy all loaded and built libraries to a pure basic image
22+
COPY --from=builder /install /usr/local
23+
# add default timezone settings
24+
COPY --from=builder /usr/share/zoneinfo/Etc/UTC /etc/localtime
25+
RUN echo "Etc/UTC" > /etc/timezone
26+
27+
# bash is installed for more convenient debugging.
28+
RUN apk --no-cache add bash
29+
30+
# copy payload code only
31+
COPY main.py ./
32+
COPY source_wikipedia_pageviews ./source_wikipedia_pageviews
33+
34+
# same command as the ENTRYPOINT below, exposed as a single string in the environment
ENV AIRBYTE_ENTRYPOINT="python /airbyte/integration_code/main.py"
35+
ENTRYPOINT ["python", "/airbyte/integration_code/main.py"]
36+
37+
LABEL io.airbyte.version=0.1.0
38+
LABEL io.airbyte.name=airbyte/source-wikipedia-pageviews
Lines changed: 79 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,79 @@
1+
# Wikipedia Pageviews Source
2+
3+
This is the repository for the Wikipedia Pageviews configuration-based source connector.
4+
For information about how to use this connector within Airbyte, see [the documentation](https://docs.airbyte.io/integrations/sources/wikipedia-pageviews).
5+
6+
## Local development
7+
8+
#### Building via Gradle
9+
You can also build the connector in Gradle. This is typically used in CI and not needed for your development workflow.
10+
11+
To build using Gradle, from the Airbyte repository root, run:
12+
```
13+
./gradlew :airbyte-integrations:connectors:source-wikipedia-pageviews:build
14+
```
15+
16+
#### Create credentials
17+
**If you are a community contributor**, follow the instructions in the [documentation](https://docs.airbyte.io/integrations/sources/wikipedia-pageviews)
18+
to generate the necessary credentials. Then create a file `secrets/config.json` conforming to the `source_wikipedia_pageviews/spec.yaml` file.
19+
Note that any directory named `secrets` is gitignored across the entire Airbyte repo, so there is no danger of accidentally checking in sensitive information.
20+
See `integration_tests/sample_config.json` for a sample config file.
21+
22+
**If you are an Airbyte core member**, copy the credentials in Lastpass under the secret name `source wikipedia-pageviews test creds`
23+
and place them into `secrets/config.json`.
24+
25+
### Locally running the connector docker image
26+
27+
#### Build
28+
First, make sure you build the latest Docker image:
29+
```
30+
docker build . -t airbyte/source-wikipedia-pageviews:dev
31+
```
32+
33+
You can also build the connector image via Gradle:
34+
```
35+
./gradlew :airbyte-integrations:connectors:source-wikipedia-pageviews:airbyteDocker
36+
```
37+
When building via Gradle, the docker image name and tag, respectively, are the values of the `io.airbyte.name` and `io.airbyte.version` `LABEL`s in
38+
the Dockerfile.
39+
40+
#### Run
41+
Then run any of the connector commands as follows:
42+
```
43+
docker run --rm airbyte/source-wikipedia-pageviews:dev spec
44+
docker run --rm -v $(pwd)/secrets:/secrets airbyte/source-wikipedia-pageviews:dev check --config /secrets/config.json
45+
docker run --rm -v $(pwd)/secrets:/secrets airbyte/source-wikipedia-pageviews:dev discover --config /secrets/config.json
46+
docker run --rm -v $(pwd)/secrets:/secrets -v $(pwd)/integration_tests:/integration_tests airbyte/source-wikipedia-pageviews:dev read --config /secrets/config.json --catalog /integration_tests/configured_catalog.json
47+
```
48+
## Testing
49+
50+
#### Acceptance Tests
51+
Customize `acceptance-test-config.yml` file to configure tests. See [Source Acceptance Tests](https://docs.airbyte.io/connector-development/testing-connectors/source-acceptance-tests-reference) for more information.
52+
If your connector needs to create or destroy resources for use during acceptance tests, create fixtures for them and place them in `integration_tests/acceptance.py`.
53+
54+
To run your integration tests with docker
55+
56+
### Using gradle to run tests
57+
All commands should be run from airbyte project root.
58+
To run unit tests:
59+
```
60+
./gradlew :airbyte-integrations:connectors:source-wikipedia-pageviews:unitTest
61+
```
62+
To run acceptance and custom integration tests:
63+
```
64+
./gradlew :airbyte-integrations:connectors:source-wikipedia-pageviews:integrationTest
65+
```
66+
67+
## Dependency Management
68+
All of your dependencies should go in `setup.py`, NOT `requirements.txt`. The requirements file is only used to connect internal Airbyte dependencies in the monorepo for local development.
69+
We split dependencies between two groups, dependencies that are:
70+
* required for your connector to work go in the `MAIN_REQUIREMENTS` list.
71+
* required for testing go in the `TEST_REQUIREMENTS` list.
72+
73+
### Publishing a new version of the connector
74+
You've checked out the repo, implemented a million dollar feature, and you're ready to share your changes with the world. Now what?
75+
1. Make sure your changes are passing unit and integration tests.
76+
1. Bump the connector version in `Dockerfile` -- just increment the value of the `LABEL io.airbyte.version` appropriately (we use [SemVer](https://semver.org/)).
77+
1. Create a Pull Request.
78+
1. Pat yourself on the back for being an awesome contributor.
79+
1. Someone from Airbyte will take a look at your PR and iterate with you to merge it into master.
Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,3 @@
1+
#
2+
# Copyright (c) 2022 Airbyte, Inc., all rights reserved.
3+
#
Lines changed: 30 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,30 @@
1+
# See [Source Acceptance Tests](https://docs.airbyte.io/connector-development/testing-connectors/source-acceptance-tests-reference)
2+
# for more information about how to configure these tests
3+
connector_image: airbyte/source-wikipedia-pageviews:dev
4+
tests:
5+
spec:
6+
- spec_path: "source_wikipedia_pageviews/spec.yaml"
7+
connection:
8+
- config_path: "secrets/config.json"
9+
status: "succeed"
10+
- config_path: "integration_tests/invalid_config.json"
11+
status: "failed"
12+
discovery:
13+
- config_path: "secrets/config.json"
14+
basic_read:
15+
- config_path: "secrets/config.json"
16+
configured_catalog_path: "integration_tests/configured_catalog.json"
17+
empty_streams: []
18+
# TODO uncomment this block to specify that the tests should assert the connector outputs the records provided in the input file
19+
# expect_records:
20+
# path: "integration_tests/expected_records.txt"
21+
# extra_fields: no
22+
# exact_order: no
23+
# extra_records: yes
24+
# incremental: # TODO if your connector does not implement incremental sync, remove this block
25+
# - config_path: "secrets/config.json"
26+
# configured_catalog_path: "integration_tests/configured_catalog.json"
27+
# future_state_path: "integration_tests/abnormal_state.json"
28+
full_refresh:
29+
- config_path: "secrets/config.json"
30+
configured_catalog_path: "integration_tests/configured_catalog.json"
Lines changed: 16 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,16 @@
1+
#!/usr/bin/env sh
2+
3+
# Build latest connector image
4+
# The substitution is left unquoted on purpose: with the usual `connector_image: <name>` spacing,
# cut leaves a leading space in front of the image name, and word splitting drops it.
docker build . -t $(grep "connector_image" acceptance-test-config.yml | head -n 1 | cut -d: -f2-)
5+
6+
# Pull latest acctest image
7+
docker pull airbyte/source-acceptance-test:latest
8+
9+
# Run
10+
docker run --rm -it \
11+
-v /var/run/docker.sock:/var/run/docker.sock \
12+
-v /tmp:/tmp \
13+
-v $(pwd):/test_input \
14+
airbyte/source-acceptance-test \
15+
--acceptance-test-config /test_input
16+
Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,9 @@
1+
plugins {
2+
id 'airbyte-python'
3+
id 'airbyte-docker'
4+
id 'airbyte-source-acceptance-test'
5+
}
6+
7+
airbytePython {
8+
moduleDirectory 'source_wikipedia_pageviews'
9+
}
Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,3 @@
1+
#
2+
# Copyright (c) 2022 Airbyte, Inc., all rights reserved.
3+
#
Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,5 @@
1+
{
2+
"per-article": {
3+
"timestamp": "2099022700"
4+
}
5+
}
Lines changed: 16 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,16 @@
1+
#
2+
# Copyright (c) 2022 Airbyte, Inc., all rights reserved.
3+
#
4+
5+
6+
import pytest
7+
8+
pytest_plugins = ("source_acceptance_test.plugin",)
9+
10+
11+
@pytest.fixture(scope="session", autouse=True)
12+
def connector_setup():
13+
"""This fixture is a placeholder for external resources that acceptance test might require."""
14+
# TODO: setup test dependencies if needed. otherwise remove the TODO comments
15+
yield
16+
# TODO: clean up test dependencies
Lines changed: 22 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,22 @@
1+
{
2+
"streams": [
3+
{
4+
"stream": {
5+
"name": "per-article",
6+
"json_schema": {},
7+
"supported_sync_modes": ["full_refresh"]
8+
},
9+
"sync_mode": "full_refresh",
10+
"destination_sync_mode": "overwrite"
11+
},
12+
{
13+
"stream": {
14+
"name": "top",
15+
"json_schema": {},
16+
"supported_sync_modes": ["full_refresh"]
17+
},
18+
"sync_mode": "full_refresh",
19+
"destination_sync_mode": "overwrite"
20+
}
21+
]
22+
}
Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,9 @@
1+
{
2+
"project": "en.wikipedia.org",
3+
"access": "all-access",
4+
"agent": "all-agents",
5+
"article": "Are_You_the_One%3F",
6+
"start": "",
7+
"end": "202440101",
8+
"country": "EN"
9+
}
Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,9 @@
1+
{
2+
"project": "en.wikipedia.org",
3+
"access": "all-access",
4+
"agent": "all-agents",
5+
"article": "Are_You_the_One%3F",
6+
"start": "20200101",
7+
"end": "20200201",
8+
"country": "EN"
9+
}
Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,5 @@
1+
{
2+
"per-article": {
3+
"timestamp": "20210505"
4+
}
5+
}
Lines changed: 13 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,13 @@
1+
#
2+
# Copyright (c) 2022 Airbyte, Inc., all rights reserved.
3+
#
4+
5+
6+
import sys
7+
8+
from airbyte_cdk.entrypoint import launch
9+
from source_wikipedia_pageviews import SourceWikipediaPageviews
10+
11+
if __name__ == "__main__":
12+
source = SourceWikipediaPageviews()
13+
launch(source, sys.argv[1:])
Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,2 @@
1+
-e ../../bases/source-acceptance-test
2+
-e .

0 commit comments

Comments
 (0)