Skip to content

Commit 54e5ea7

Browse files
authored
Update crawler to crawl all io.opentelemetry.* groups (#7316)
1 parent d70fe5b commit 54e5ea7

File tree

3 files changed

+84
-38
lines changed

3 files changed

+84
-38
lines changed

javadoc-crawler/src/main/java/io/opentelemetry/javadocs/Artifact.java

+12-1
Original file line numberDiff line numberDiff line change
@@ -6,19 +6,30 @@
66
package io.opentelemetry.javadocs;
77

88
public class Artifact {
9+
private final String group;
910
private final String name;
1011
private final String version;
1112

12-
public Artifact(String name, String version) {
13+
public Artifact(String group, String name, String version) {
14+
this.group = group;
1315
this.name = name;
1416
this.version = version;
1517
}
1618

19+
public String getGroup() {
20+
return group;
21+
}
22+
1723
public String getName() {
1824
return name;
1925
}
2026

2127
public String getVersion() {
2228
return version;
2329
}
30+
31+
@Override
32+
public String toString() {
33+
return group + ":" + name + ":" + version;
34+
}
2435
}

javadoc-crawler/src/main/java/io/opentelemetry/javadocs/JavaDocsCrawler.java

+65-28
Original file line numberDiff line numberDiff line change
@@ -15,9 +15,11 @@
1515
import java.util.List;
1616
import java.util.Locale;
1717
import java.util.Map;
18+
import java.util.Objects;
1819
import java.util.Optional;
1920
import java.util.logging.Level;
2021
import java.util.logging.Logger;
22+
import java.util.stream.Collectors;
2123

2224
/**
2325
* The javadoc.io site relies on someone accessing the page for an artifact version in order to
@@ -26,7 +28,16 @@
2628
* pages on the javadoc.io site to trigger updates.
2729
*/
2830
public final class JavaDocsCrawler {
29-
private static final String GROUP = "io.opentelemetry";
31+
// Track list of groups and the minimum artifact versions that should be crawled. Update to the
32+
// latest periodically to avoid crawling artifacts that stopped being published.
33+
private static final Map<String, String> GROUPS_AND_MIN_VERSION =
34+
Map.of(
35+
"io.opentelemetry", "1.49.0",
36+
"io.opentelemetry.instrumentation", "2.15.0",
37+
"io.opentelemetry.contrib", "1.46.0",
38+
"io.opentelemetry.semconv", "1.32.0",
39+
"io.opentelemetry.proto", "1.3.2");
40+
3041
private static final String MAVEN_CENTRAL_BASE_URL =
3142
"https://search.maven.org/solrsearch/select?q=g:";
3243
private static final String JAVA_DOCS_BASE_URL = "https://javadoc.io/doc/";
@@ -41,23 +52,34 @@ public final class JavaDocsCrawler {
4152

4253
public static void main(String[] args) throws Exception {
4354
HttpClient client = HttpClient.newHttpClient();
44-
List<Artifact> artifacts = getArtifacts(client);
45-
if (artifacts.isEmpty()) {
46-
logger.log(Level.SEVERE, "No artifacts found");
47-
return;
48-
}
49-
logger.info(String.format(Locale.ROOT, "Found %d artifacts", artifacts.size()));
5055

51-
List<String> updated = crawlJavaDocs(client, artifacts);
52-
if (updated.isEmpty()) {
53-
logger.info("No updates were needed");
54-
return;
55-
}
56+
for (Map.Entry<String, String> groupAndMinVersion : GROUPS_AND_MIN_VERSION.entrySet()) {
57+
String group = groupAndMinVersion.getKey();
58+
59+
List<Artifact> artifacts = getArtifacts(client, group);
60+
if (artifacts.isEmpty()) {
61+
logger.log(Level.SEVERE, "No artifacts found for group " + group);
62+
continue;
63+
}
64+
logger.info(
65+
String.format(Locale.ROOT, "Found %d artifacts for group " + group, artifacts.size()));
5666

57-
logger.info("Artifacts that triggered updates:\n" + String.join("\n", updated));
67+
List<Artifact> updated = crawlJavaDocs(client, groupAndMinVersion.getValue(), artifacts);
68+
if (updated.isEmpty()) {
69+
logger.info("No updates were needed for group " + group);
70+
continue;
71+
}
72+
73+
logger.info(
74+
"Artifacts that triggered updates for group "
75+
+ group
76+
+ ":\n"
77+
+ updated.stream().map(Artifact::toString).collect(Collectors.joining("\n")));
78+
}
5879
}
5980

60-
static List<Artifact> getArtifacts(HttpClient client) throws IOException, InterruptedException {
81+
static List<Artifact> getArtifacts(HttpClient client, String group)
82+
throws IOException, InterruptedException {
6183
int start = 0;
6284
Integer numFound;
6385
List<Artifact> result = new ArrayList<>();
@@ -67,7 +89,7 @@ static List<Artifact> getArtifacts(HttpClient client) throws IOException, Interr
6789
Thread.sleep(THROTTLE_MS); // try not to DDoS the site, it gets knocked over easily
6890
}
6991

70-
Map<?, ?> map = queryMavenCentral(client, start);
92+
Map<?, ?> map = queryMavenCentral(client, group, start);
7193

7294
numFound =
7395
Optional.ofNullable(map)
@@ -93,26 +115,26 @@ private static List<Artifact> convertToArtifacts(Map<?, ?> map) {
93115
List<Artifact> artifacts = new ArrayList<>();
94116
for (Object doc : docs) {
95117
Map<?, ?> docMap = (Map<?, ?>) doc;
96-
String artifact = (String) docMap.get("a");
97-
String version = (String) docMap.get("latestVersion");
98-
if (artifact != null && version != null) {
99-
artifacts.add(new Artifact(artifact, version));
100-
}
118+
String group = Objects.requireNonNull((String) docMap.get("g"), "g");
119+
String artifact = Objects.requireNonNull((String) docMap.get("a"), "a");
120+
String version =
121+
Objects.requireNonNull((String) docMap.get("latestVersion"), "latestVersion");
122+
artifacts.add(new Artifact(Objects.requireNonNull(group), artifact, version));
101123
}
102124
return artifacts;
103125
})
104126
.orElseGet(ArrayList::new);
105127
}
106128

107-
private static Map<?, ?> queryMavenCentral(HttpClient client, int start)
129+
private static Map<?, ?> queryMavenCentral(HttpClient client, String group, int start)
108130
throws IOException, InterruptedException {
109131
URI uri =
110132
URI.create(
111133
String.format(
112134
Locale.ROOT,
113135
"%s%s&rows=%d&start=%d&wt=json",
114136
MAVEN_CENTRAL_BASE_URL,
115-
GROUP,
137+
group,
116138
PAGE_SIZE,
117139
start));
118140

@@ -122,21 +144,35 @@ private static List<Artifact> convertToArtifacts(Map<?, ?> map) {
122144
if (response.statusCode() != 200) {
123145
logger.log(
124146
Level.SEVERE,
125-
"Unexpected response code: " + response.statusCode() + ": " + response.body());
147+
"Unexpected response code "
148+
+ response.statusCode()
149+
+ " for uri: "
150+
+ uri.toASCIIString()
151+
+ "\n"
152+
+ response.body());
126153
throw new IOException("Unable to pull Maven central artifacts list");
127154
}
128155
return objectMapper.readValue(response.body(), Map.class);
129156
}
130157

131-
static List<String> crawlJavaDocs(HttpClient client, List<Artifact> artifacts)
158+
static List<Artifact> crawlJavaDocs(
159+
HttpClient client, String minVersion, List<Artifact> artifacts)
132160
throws IOException, InterruptedException {
133-
List<String> updatedArtifacts = new ArrayList<>();
161+
List<Artifact> updatedArtifacts = new ArrayList<>();
134162

135163
for (Artifact artifact : artifacts) {
164+
if (artifact.getVersion().compareTo(minVersion) < 0) {
165+
logger.info(
166+
String.format(
167+
"Skipping crawling %s due to version %s being less than minVersion %s",
168+
artifact, artifact.getVersion(), minVersion));
169+
continue;
170+
}
171+
136172
String[] parts = artifact.getName().split("-");
137173
StringBuilder path = new StringBuilder();
138174
path.append(JAVA_DOCS_BASE_URL)
139-
.append(GROUP)
175+
.append(artifact.getGroup())
140176
.append("/")
141177
.append(artifact.getName())
142178
.append("/")
@@ -146,6 +182,7 @@ static List<String> crawlJavaDocs(HttpClient client, List<Artifact> artifacts)
146182
.append("/package-summary.html");
147183

148184
HttpRequest crawlRequest = HttpRequest.newBuilder(URI.create(path.toString())).GET().build();
185+
logger.info(String.format("Crawling %s at: %s", artifact, path));
149186
HttpResponse<String> crawlResponse =
150187
client.send(crawlRequest, HttpResponse.BodyHandlers.ofString());
151188

@@ -156,15 +193,15 @@ static List<String> crawlJavaDocs(HttpClient client, List<Artifact> artifacts)
156193
String.format(
157194
Locale.ROOT,
158195
"Crawl failed for %s with status code %d at URL %s\nResponse: %s",
159-
artifact.getName(),
196+
artifact,
160197
crawlResponse.statusCode(),
161198
path,
162199
crawlResponse.body()));
163200
continue;
164201
}
165202

166203
if (crawlResponse.body().contains(JAVA_DOC_DOWNLOADED_TEXT)) {
167-
updatedArtifacts.add(artifact.getName());
204+
updatedArtifacts.add(artifact);
168205
}
169206

170207
Thread.sleep(THROTTLE_MS); // some light throttling

javadoc-crawler/src/test/java/io/opentelemetry/javadocs/JavaDocsCrawlerTest.java

+7-9
Original file line numberDiff line numberDiff line change
@@ -16,7 +16,6 @@
1616
import java.net.http.HttpClient;
1717
import java.net.http.HttpRequest;
1818
import java.net.http.HttpResponse;
19-
import java.util.ArrayList;
2019
import java.util.List;
2120
import org.junit.jupiter.api.Test;
2221
import org.junit.jupiter.api.extension.ExtendWith;
@@ -39,8 +38,8 @@ void testGetArtifactsHandlesPagination() throws IOException, InterruptedExceptio
3938
"response": {
4039
"numFound": 40,
4140
"docs": [
42-
{"a": "artifact1", "latestVersion": "1.0"},
43-
{"a": "artifact2", "latestVersion": "1.1"}
41+
{"g": "group", "a": "artifact1", "latestVersion": "1.0"},
42+
{"g": "group", "a": "artifact2", "latestVersion": "1.1"}
4443
]
4544
}
4645
}
@@ -51,7 +50,7 @@ void testGetArtifactsHandlesPagination() throws IOException, InterruptedExceptio
5150
"response": {
5251
"numFound": 40,
5352
"docs": [
54-
{"a": "artifact3", "latestVersion": "2.0"}
53+
{"g": "group", "a": "artifact3", "latestVersion": "2.0"}
5554
]
5655
}
5756
}
@@ -66,7 +65,7 @@ void testGetArtifactsHandlesPagination() throws IOException, InterruptedExceptio
6665
.thenReturn(mockMavenCentralRequest1)
6766
.thenReturn(mockMavenCentralRequest2);
6867

69-
List<Artifact> artifacts = JavaDocsCrawler.getArtifacts(mockClient);
68+
List<Artifact> artifacts = JavaDocsCrawler.getArtifacts(mockClient, "io.opentelemetry");
7069

7170
// 2 calls for the pagination
7271
verify(mockClient, times(2)).send(any(), any());
@@ -75,22 +74,21 @@ void testGetArtifactsHandlesPagination() throws IOException, InterruptedExceptio
7574

7675
@Test
7776
void testCrawler() throws IOException, InterruptedException {
78-
List<Artifact> artifacts = new ArrayList<>();
79-
artifacts.add(new Artifact("opentelemetry-context", "1.49.0"));
77+
Artifact artifact = new Artifact("io.opentelemetry", "opentelemetry-context", "1.49.0");
8078
ArgumentCaptor<HttpRequest> requestCaptor = ArgumentCaptor.forClass(HttpRequest.class);
8179

8280
when(mockJavaDocResponse.body()).thenReturn(JAVA_DOC_DOWNLOADED_TEXT);
8381
when(mockJavaDocResponse.statusCode()).thenReturn(200);
8482

8583
when(mockClient.send(any(), any())).thenReturn(mockJavaDocResponse);
8684

87-
List<String> updated = JavaDocsCrawler.crawlJavaDocs(mockClient, artifacts);
85+
List<Artifact> updated = JavaDocsCrawler.crawlJavaDocs(mockClient, "1.49.0", List.of(artifact));
8886

8987
verify(mockClient, times(1)).send(requestCaptor.capture(), any());
9088

9189
assertThat(requestCaptor.getValue().uri().toString())
9290
.isEqualTo(
9391
"https://javadoc.io/doc/io.opentelemetry/opentelemetry-context/1.49.0/opentelemetry/context/package-summary.html");
94-
assertThat(updated).containsExactly("opentelemetry-context");
92+
assertThat(updated).containsExactly(artifact);
9593
}
9694
}

0 commit comments

Comments
 (0)