Skip to content

Commit 81a75e4

Browse files
committed
Chore: adding tests to service
1 parent d07401d commit 81a75e4

File tree

8 files changed

+433
-26
lines changed

8 files changed

+433
-26
lines changed

README.md

+6-6
Original file line numberDiff line numberDiff line change
@@ -1,20 +1,20 @@
1-
# Go Webcrawler
2-
1+
# Go Webcrawler 🕷️
32
[![Go](https://github.com/Acollie/Go-Webcrawler/actions/workflows/go.yml/badge.svg)](https://github.com/Acollie/Go-Webcrawler/actions/workflows/go.yml)
43

54
![Example of a graph](/assets/example.png "Example of a graph")
6-
## Overview
5+
6+
## Overview 🌐
77
![Overview of architecture](/assets/overview.png "Overview")
88

9-
### SQS Queue
9+
### SQS Queue 📥
1010

1111
The SQS queue is used to store the URLs that need to be crawled.
1212

13-
### DynamoDB
13+
### DynamoDB 📦
1414

1515
DynamoDB is used to store the URLs that have been crawled and the URLs that have been found on the page.
1616

17-
### Neptune / Neo4k
17+
### Neptune / Neo4j 🌌
1818

1919
Neptune is used to store the relationships between the URLs that have been found on the page.
2020

go.mod

+53-6
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,8 @@
11
module webcrawler
22

3-
go 1.21
3+
go 1.22
4+
5+
toolchain go1.23.3
46

57
require (
68
github.com/anaskhan96/soup v1.2.5
@@ -12,12 +14,16 @@ require (
1214
github.com/joho/godotenv v1.5.1
1315
github.com/lib/pq v1.10.9
1416
github.com/neo4j/neo4j-go-driver/v5 v5.20.0
15-
github.com/stretchr/testify v1.6.1
17+
github.com/stretchr/testify v1.9.0
1618
github.com/temoto/robotstxt v1.1.2
17-
gopkg.in/yaml.v3 v3.0.0-20200313102051-9f266ea9e77c
19+
github.com/testcontainers/testcontainers-go/modules/neo4j v0.35.0
20+
gopkg.in/yaml.v3 v3.0.1
1821
)
1922

2023
require (
24+
dario.cat/mergo v1.0.0 // indirect
25+
github.com/Azure/go-ansiterm v0.0.0-20210617225240-d185dfc1b5a1 // indirect
26+
github.com/Microsoft/go-winio v0.6.2 // indirect
2127
github.com/aws/aws-sdk-go-v2/credentials v1.16.13 // indirect
2228
github.com/aws/aws-sdk-go-v2/feature/ec2/imds v1.14.10 // indirect
2329
github.com/aws/aws-sdk-go-v2/internal/configsources v1.2.10 // indirect
@@ -31,9 +37,50 @@ require (
3137
github.com/aws/aws-sdk-go-v2/service/ssooidc v1.21.5 // indirect
3238
github.com/aws/aws-sdk-go-v2/service/sts v1.26.6 // indirect
3339
github.com/aws/smithy-go v1.19.0 // indirect
34-
github.com/davecgh/go-spew v1.1.0 // indirect
40+
github.com/cenkalti/backoff/v4 v4.2.1 // indirect
41+
github.com/containerd/containerd v1.7.18 // indirect
42+
github.com/containerd/log v0.1.0 // indirect
43+
github.com/containerd/platforms v0.2.1 // indirect
44+
github.com/cpuguy83/dockercfg v0.3.2 // indirect
45+
github.com/davecgh/go-spew v1.1.1 // indirect
46+
github.com/distribution/reference v0.6.0 // indirect
47+
github.com/docker/docker v27.1.1+incompatible // indirect
48+
github.com/docker/go-connections v0.5.0 // indirect
49+
github.com/docker/go-units v0.5.0 // indirect
50+
github.com/felixge/httpsnoop v1.0.4 // indirect
51+
github.com/go-logr/logr v1.4.1 // indirect
52+
github.com/go-logr/stdr v1.2.2 // indirect
53+
github.com/go-ole/go-ole v1.2.6 // indirect
54+
github.com/gogo/protobuf v1.3.2 // indirect
55+
github.com/google/uuid v1.6.0 // indirect
3556
github.com/jmespath/go-jmespath v0.4.0 // indirect
57+
github.com/klauspost/compress v1.17.4 // indirect
58+
github.com/lufia/plan9stats v0.0.0-20211012122336-39d0f177ccd0 // indirect
59+
github.com/magiconair/properties v1.8.7 // indirect
60+
github.com/moby/docker-image-spec v1.3.1 // indirect
61+
github.com/moby/patternmatcher v0.6.0 // indirect
62+
github.com/moby/sys/sequential v0.5.0 // indirect
63+
github.com/moby/sys/user v0.1.0 // indirect
64+
github.com/moby/term v0.5.0 // indirect
65+
github.com/morikuni/aec v1.0.0 // indirect
66+
github.com/opencontainers/go-digest v1.0.0 // indirect
67+
github.com/opencontainers/image-spec v1.1.0 // indirect
68+
github.com/pkg/errors v0.9.1 // indirect
3669
github.com/pmezard/go-difflib v1.0.0 // indirect
37-
golang.org/x/net v0.17.0 // indirect
38-
golang.org/x/text v0.13.0 // indirect
70+
github.com/power-devops/perfstat v0.0.0-20210106213030-5aafc221ea8c // indirect
71+
github.com/shirou/gopsutil/v3 v3.23.12 // indirect
72+
github.com/shoenig/go-m1cpu v0.1.6 // indirect
73+
github.com/sirupsen/logrus v1.9.3 // indirect
74+
github.com/testcontainers/testcontainers-go v0.35.0 // indirect
75+
github.com/tklauser/go-sysconf v0.3.12 // indirect
76+
github.com/tklauser/numcpus v0.6.1 // indirect
77+
github.com/yusufpapurcu/wmi v1.2.3 // indirect
78+
go.opentelemetry.io/contrib/instrumentation/net/http/otelhttp v0.49.0 // indirect
79+
go.opentelemetry.io/otel v1.24.0 // indirect
80+
go.opentelemetry.io/otel/metric v1.24.0 // indirect
81+
go.opentelemetry.io/otel/trace v1.24.0 // indirect
82+
golang.org/x/crypto v0.31.0 // indirect
83+
golang.org/x/net v0.26.0 // indirect
84+
golang.org/x/sys v0.28.0 // indirect
85+
golang.org/x/text v0.21.0 // indirect
3986
)

go.sum

+187-11
Large diffs are not rendered by default.

graphx/add.go

+2-3
Original file line numberDiff line numberDiff line change
@@ -14,7 +14,6 @@ func (g *Graph) AddWebsite(ctx context.Context, web site.Website) error {
1414
on create set w.prominence = 1
1515
on create set w.crawled_at = datetime()
1616
on match set w.prominence = coalesce(w.prominence, 0) + 1
17-
1817
`, map[string]interface{}{
1918
"url": web.Url,
2019
"prominenceValue": web.ProminenceValue,
@@ -26,7 +25,7 @@ on create set w.crawled_at = datetime()
2625
}
2726

2827
func (g *Graph) AddLink(ctx context.Context, page site.Page) error {
29-
ses := g.Neo4j.NewSession(ctx, neo4j.SessionConfig{})
28+
ses := g.Neo4j.NewSession(ctx, Neo4kStdSession)
3029
var err error
3130
for _, link := range page.Links {
3231
_, err = ses.ExecuteWrite(ctx, func(tx neo4j.ManagedTransaction) (any, error) {
@@ -54,7 +53,7 @@ func (g *Graph) AddLink(ctx context.Context, page site.Page) error {
5453
}
5554

5655
func (g *Graph) AddPage(ctx context.Context, web site.Website, page site.Page) error {
57-
ses := g.Neo4j.NewSession(ctx, neo4j.SessionConfig{})
56+
ses := g.Neo4j.NewSession(ctx, Neo4kStdSession)
5857
_, err := ses.ExecuteWrite(ctx, func(tx neo4j.ManagedTransaction) (any, error) {
5958
_, err := tx.Run(ctx, `
6059
MERGE (page:Page {url:$url, title:$title})

graphx/conn.go

+36
Original file line numberDiff line numberDiff line change
@@ -2,7 +2,9 @@ package graphx
22

33
import (
44
"context"
5+
"fmt"
56
"github.com/neo4j/neo4j-go-driver/v5/neo4j"
7+
neo4jTest "github.com/testcontainers/testcontainers-go/modules/neo4j"
68
)
79

810
func Conn(ctx context.Context, user string, password string, url string) (neo4j.DriverWithContext, error) {
@@ -30,3 +32,37 @@ func New(graph neo4j.DriverWithContext) *Graph {
3032
}
3133

3234
}
35+
func NewTestContainer(ctx context.Context) (*Graph, error) {
36+
neo4jContainer, err := neo4jTest.Run(ctx,
37+
"neo4j:4.4",
38+
neo4jTest.WithoutAuthentication(),
39+
neo4jTest.WithLabsPlugin(neo4jTest.Apoc),
40+
)
41+
42+
if err != nil {
43+
return nil, fmt.Errorf("failed to run container: %w", err)
44+
}
45+
46+
boltURL, err := neo4jContainer.BoltUrl(ctx)
47+
if err != nil {
48+
return nil, fmt.Errorf("failed to get bolt url: %w", err)
49+
}
50+
51+
driver, err := neo4j.NewDriverWithContext(
52+
boltURL,
53+
neo4j.NoAuth(),
54+
)
55+
if err != nil {
56+
return nil, fmt.Errorf("failed to create driver: %w", err)
57+
}
58+
59+
err = driver.VerifyConnectivity(ctx)
60+
if err != nil {
61+
return nil, fmt.Errorf("failed to verify connectivity: %w", err)
62+
}
63+
64+
return &Graph{
65+
Neo4j: driver,
66+
}, nil
67+
68+
}

graphx/get.go

+91
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,91 @@
1+
package graphx
2+
3+
import (
4+
"context"
5+
"fmt"
6+
"github.com/neo4j/neo4j-go-driver/v5/neo4j"
7+
"webcrawler/site"
8+
)
9+
10+
func (g *Graph) GetWebsite(ctx context.Context, rootWebsite string) (site.Website, error) {
11+
ses := g.Neo4j.NewSession(ctx, Neo4kStdSession)
12+
defer ses.Close(ctx)
13+
14+
result, err := ses.ExecuteRead(ctx, func(tx neo4j.ManagedTransaction) (any, error) {
15+
rows, err := tx.Run(ctx, `
16+
MATCH (w:Website {url: $url})
17+
RETURN w.url AS websiteUrl, w.ProminenceValue AS prominenceValue
18+
`, map[string]any{
19+
"url": rootWebsite,
20+
})
21+
if err != nil {
22+
return nil, err
23+
}
24+
25+
if rows.Next(ctx) {
26+
rec := rows.Record()
27+
wUrl, _ := rec.Get("websiteUrl")
28+
promVal, _ := rec.Get("prominenceValue")
29+
return site.Website{
30+
Url: wUrl.(string),
31+
ProminenceValue: promVal.(float64),
32+
}, nil
33+
}
34+
return nil, fmt.Errorf("Website not found")
35+
})
36+
if err != nil {
37+
return site.Website{}, err
38+
}
39+
40+
return result.(site.Website), nil
41+
}
42+
43+
func (g *Graph) GetPage(ctx context.Context, pageUrl string) (site.Page, error) {
44+
ses := g.Neo4j.NewSession(ctx, Neo4kStdSession)
45+
defer ses.Close(ctx)
46+
47+
result, err := ses.ExecuteRead(ctx, func(tx neo4j.ManagedTransaction) (any, error) {
48+
rows, err := tx.Run(ctx, `
49+
MATCH (w:Page {url: $url})
50+
RETURN w.url AS websiteUrl, w.baseURL AS baseURL, w.title AS title
51+
`, map[string]any{"url": pageUrl})
52+
if err != nil {
53+
return nil, err
54+
}
55+
56+
if rows.Next(ctx) {
57+
rec := rows.Record()
58+
59+
wUrlVal, _ := rec.Get("websiteUrl")
60+
baseUrlVal, _ := rec.Get("baseURL")
61+
titleVal, _ := rec.Get("title")
62+
63+
if wUrlVal == nil {
64+
return nil, fmt.Errorf("missing url property")
65+
}
66+
67+
wUrlStr := wUrlVal.(string)
68+
var baseUrlStr string
69+
if baseUrlVal != nil {
70+
baseUrlStr = baseUrlVal.(string)
71+
}
72+
73+
titleStr := ""
74+
if titleVal != nil {
75+
titleStr = titleVal.(string)
76+
}
77+
78+
return site.Page{
79+
Url: wUrlStr,
80+
BaseURL: baseUrlStr,
81+
Title: titleStr,
82+
}, nil
83+
}
84+
return nil, fmt.Errorf("page not found")
85+
})
86+
if err != nil {
87+
return site.Page{}, err
88+
}
89+
90+
return result.(site.Page), nil
91+
}

graphx/graph_test.go

+54
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,54 @@
1+
package graphx
2+
3+
import (
4+
"context"
5+
"github.com/stretchr/testify/require"
6+
"testing"
7+
"webcrawler/site"
8+
)
9+
10+
func Test_conn(t *testing.T) {
11+
ctx := context.Background()
12+
graph, err := NewTestContainer(ctx)
13+
require.NoError(t, err)
14+
require.NotNil(t, graph)
15+
16+
t.Run("Add a page to the graph", func(t *testing.T) {
17+
err := graph.AddPage(ctx, site.Website{
18+
Url: "https:\\/\\/example.com",
19+
ProminenceValue: 5,
20+
}, site.Page{
21+
Url: "https:\\/\\/example.com\\/page",
22+
Title: "Example Page",
23+
})
24+
require.NoError(t, err)
25+
page, err := graph.GetPage(ctx, "https:\\/\\/example.com\\/page")
26+
require.NoError(t, err)
27+
require.Equal(t, "https:\\/\\/example.com\\/page", page.Url)
28+
require.Equal(t, "Example Page", page.Title)
29+
30+
})
31+
32+
t.Run("Add a website to the graph", func(t *testing.T) {
33+
err := graph.AddWebsite(ctx, site.Website{
34+
Url: "https:\\/\\/another-example.org",
35+
ProminenceValue: 3,
36+
})
37+
require.NoError(t, err)
38+
website, err := graph.GetWebsite(ctx, "https:\\/\\/another-example.org")
39+
require.NoError(t, err)
40+
require.Equal(t, "https:\\/\\/another-example.org", website.Url)
41+
require.Equal(t, 3.0, website.ProminenceValue)
42+
// \[Optionally verify stored data\]
43+
})
44+
t.Run("Add a link to the graph", func(t *testing.T) {
45+
err := graph.AddLink(ctx, site.Page{
46+
Url: "https:\\/\\/example.com\\/page",
47+
Links: []string{
48+
"https:\\/\\/example.com\\/page\\/link1",
49+
"https:\\/\\/example.com\\/page\\/link2",
50+
},
51+
})
52+
require.NoError(t, err)
53+
})
54+
}

graphx/type.go

+4
Original file line numberDiff line numberDiff line change
@@ -5,3 +5,7 @@ import "github.com/neo4j/neo4j-go-driver/v5/neo4j"
55
type Graph struct {
66
Neo4j neo4j.DriverWithContext
77
}
8+
9+
var (
10+
Neo4kStdSession neo4j.SessionConfig = neo4j.SessionConfig{}
11+
)

0 commit comments

Comments
 (0)