Skip to content

Commit fe334de

Browse files
authored
Merge pull request #2667 from sciencehistory/cloudfront_over_s3
Support Cloudfront distros in front of select S3 buckets
2 parents 7eea8d0 + c000046 commit fe334de

File tree

6 files changed

+355
-25
lines changed

6 files changed

+355
-25
lines changed

Gemfile

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -66,6 +66,7 @@ gem "aws-sdk-cloudwatchevents", "~> 1.0"
6666
gem "aws-sdk-cloudwatchlogs", "~> 1.0"
6767
gem "aws-sdk-mediaconvert", "~> 1.0"
6868
gem "aws-sdk-s3", "~> 1.0"
69+
gem "aws-sdk-cloudfront", "~> 1.91"
6970

7071
# Use postgresql as the database for Active Record
7172
gem 'pg', '>= 0.18', '< 2.0'

Gemfile.lock

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -97,6 +97,9 @@ GEM
9797
execjs (~> 2)
9898
aws-eventstream (1.3.0)
9999
aws-partitions (1.939.0)
100+
aws-sdk-cloudfront (1.91.0)
101+
aws-sdk-core (~> 3, >= 3.193.0)
102+
aws-sigv4 (~> 1.1)
100103
aws-sdk-cloudwatchevents (1.72.0)
101104
aws-sdk-core (~> 3, >= 3.193.0)
102105
aws-sigv4 (~> 1.1)
@@ -768,6 +771,7 @@ DEPENDENCIES
768771
activerecord-postgres_enum (~> 2.0)
769772
alba (~> 3.1)
770773
attr_json (~> 2.3)
774+
aws-sdk-cloudfront (~> 1.91)
771775
aws-sdk-cloudwatchevents (~> 1.0)
772776
aws-sdk-cloudwatchlogs (~> 1.0)
773777
aws-sdk-mediaconvert (~> 1.0)

lib/scihist_digicoll/env.rb

Lines changed: 83 additions & 25 deletions
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,7 @@
22
require "shrine/storage/file_system"
33
require "shrine/storage/s3"
44
require 'faster_s3_url/shrine/storage'
5+
require 'scihist_digicoll/shrine_storage/cloudfront_s3_storage'
56

67

78
module ScihistDigicoll
@@ -236,15 +237,37 @@ def self.persistent_redis_connection!
236237
define_key :s3_bucket_derivatives
237238
define_key :s3_bucket_derivatives_video
238239

239-
# Note: the values for :s3_bucket_derivatives_host and :s3_bucket_derivatives_video_host
240-
# can be obtained by running `terraform output`.
241-
define_key :s3_bucket_derivatives_host # Cloudfront hostname for regular derivs bucket
242-
define_key :s3_bucket_derivatives_video_host # Ditto, for video derivs bucket
243-
244240
define_key :s3_bucket_uploads
245241
define_key :s3_bucket_on_demand_derivatives
246242
define_key :s3_bucket_dzi
247243

244+
# if we are using CloudFront in front of a bucket, we set Cloudfront Distro hostname
245+
# in variables corresponding to bucket name env, with `_host` on the end. These
246+
# are all assumed cloudfront if set.
247+
#
248+
# All of these cloudfront hostnames are somewhat opaque, and can be found
249+
# from `terraform output` output from existing infrastructure, either staging
250+
# or prod.
251+
252+
define_key :s3_bucket_originals_host
253+
define_key :s3_bucket_originals_video_host # not currently used as we don't provide access to originals
254+
define_key :s3_bucket_derivatives_host
255+
define_key :s3_bucket_derivatives_video_host
256+
define_key :s3_bucket_on_demand_derivatives_host
257+
define_key :s3_bucket_dzi_host
258+
259+
# If we are using Cloudfront with restricted buckets, the key-pair id and RSA private
260+
# key need to be set.
261+
define_key :cloudfront_key_pair_id # from `terraform output`
262+
# private key can be found in 1password as:
263+
# `scihist-digicoll-production_private_key.pem` or
264+
# `scihist-digicoll-staging_private_key.pem`
265+
#
266+
# Value needs to be exported from 1Password in `PKCS#8` format, will begin with
267+
# '-----BEGIN PRIVATE KEY-----' [NOT 'OPENSSH PRIVATE KEY']
268+
define_key :cloudfront_private_key
269+
270+
248271
define_key :ingest_bucket, default: -> {
249272
if !Rails.env.production?
250273
# This bucket isn't actually mounted on workstations, but you can
@@ -361,13 +384,18 @@ def self.solr_collection_name
361384
# @param bucket_key [String] required. in `production` mode this is the bucket name, otherwise
362385
# it becomes part of the prefix location.
363386
#
387+
# @param public [Boolean] (default false), if FALSE the storage requires signed URLs, and
388+
# we will set up shrine storage to generate them -- either using
389+
# AWS access key (direct), or Cloudfront private key (if host is set for
390+
# Cloudfront distro in front)
391+
#
364392
# @param prefix [String] prefix passed to shrine storage, a "directory" within the
365393
# storage. for dev_s3 and dev_file modes combined with
366394
# other pre-prefix.
367395
#
368-
# @param host [String] used only for `production` S3 mode, passed to Shrine storage as
369-
# 'host' param, used for cloudfront CDN and/or other CNAME, alternate
370-
# host used to access S3 bucket.
396+
# @param host [String] used only for `production` S3 mode, if set we assume a CloudFront CDN distro
397+
# on top of bucket, and set up url-generation accordingly, including with
398+
# Cloudfront-style signing (using env for cloudfront key pair id and private key, which will be required)
371399
#
372400
# @param s3_storage_options [Hash] passed directly to Shrine storage for S3 modes, additional
373401
# arbitrary options. Can override other defaults or params.
@@ -377,7 +405,7 @@ def self.solr_collection_name
377405
# file system). Normally left unset, it will default to
378406
# env key :storage_mode, which is what you want it to do.
379407
#
380-
def self.appropriate_shrine_storage(bucket_key:, mode: lookup!(:storage_mode), prefix: nil,
408+
def self.appropriate_shrine_storage(bucket_key:, public: false, mode: lookup!(:storage_mode), prefix: nil,
381409
host: nil, s3_storage_options: {} )
382410
unless %I{s3_bucket_uploads s3_bucket_originals s3_bucket_originals_video s3_bucket_derivatives
383411
s3_bucket_derivatives_video
@@ -400,14 +428,29 @@ def self.appropriate_shrine_storage(bucket_key:, mode: lookup!(:storage_mode), p
400428
region: lookup(:aws_region)
401429
}.merge(s3_storage_options))
402430
elsif mode == "production"
403-
FasterS3Url::Shrine::Storage.new(**{
404-
bucket: lookup!(bucket_key),
405-
host: host,
406-
prefix: prefix,
407-
access_key_id: lookup!(:aws_access_key_id),
408-
secret_access_key: lookup!(:aws_secret_access_key),
409-
region: lookup!(:aws_region)
410-
}.merge(s3_storage_options))
431+
if host.present?
432+
# Assumed cloudfront if we have a host!
433+
ScihistDigicoll::ShrineStorage::CloudfrontS3Storage.new(**{
434+
bucket: lookup!(bucket_key),
435+
host: host,
436+
public: public,
437+
prefix: prefix,
438+
access_key_id: lookup!(:aws_access_key_id),
439+
secret_access_key: lookup!(:aws_secret_access_key),
440+
region: lookup!(:aws_region),
441+
cloudfront_key_pair_id: lookup(:cloudfront_key_pair_id),
442+
cloudfront_private_key: lookup(:cloudfront_private_key)
443+
}.merge(s3_storage_options))
444+
else
445+
FasterS3Url::Shrine::Storage.new(**{
446+
bucket: lookup!(bucket_key),
447+
public: public,
448+
prefix: prefix,
449+
access_key_id: lookup!(:aws_access_key_id),
450+
secret_access_key: lookup!(:aws_secret_access_key),
451+
region: lookup!(:aws_region)
452+
}.merge(s3_storage_options))
453+
end
411454
else
412455
raise TypeError.new("unrecognized storage mode: #{mode}")
413456
end
@@ -423,22 +466,22 @@ def self.shrine_cache_storage
423466

424467
def self.shrine_store_storage
425468
@shrine_store_storage ||=
426-
appropriate_shrine_storage(bucket_key: :s3_bucket_originals)
469+
appropriate_shrine_storage(bucket_key: :s3_bucket_originals, public: false, host: lookup(:s3_bucket_originals_host))
427470
end
428471

429472
# we store video originals in separate location
430473
def self.shrine_store_video_storage
431474
@shrine_video_store_storage ||=
432-
appropriate_shrine_storage(bucket_key: :s3_bucket_originals_video)
475+
appropriate_shrine_storage(bucket_key: :s3_bucket_originals_video, public: false, host: lookup(:s3_bucket_originals_video_host))
433476
end
434477

435478
# Note we set shrine S3 storage to public, to upload with public ACLs
436479
def self.shrine_derivatives_storage
437480
@shrine_derivatives_storage ||=
438481
appropriate_shrine_storage( bucket_key: :s3_bucket_derivatives,
439482
host: lookup(:s3_bucket_derivatives_host),
483+
public: true,
440484
s3_storage_options: {
441-
public: true,
442485
upload_options: {
443486
# derivatives are public and at unique random URLs, so
444487
# can be cached far-future
@@ -451,8 +494,8 @@ def self.shrine_video_derivatives_storage
451494
@shrine_derivatives_video_storage ||=
452495
appropriate_shrine_storage( bucket_key: :s3_bucket_derivatives_video,
453496
host: lookup(:s3_bucket_derivatives_video_host),
497+
public: true,
454498
s3_storage_options: {
455-
public: true,
456499
upload_options: {
457500
# derivatives are public and at unique random URLs, so
458501
# can be cached far-future
@@ -467,16 +510,25 @@ def self.shrine_video_derivatives_storage
467510
def self.shrine_restricted_derivatives_storage
468511
@shrine_restricted_derivatives_storage ||=
469512
appropriate_shrine_storage( bucket_key: :s3_bucket_originals,
513+
host: lookup(:s3_bucket_originals_host),
514+
public: false,
470515
prefix: "restricted_derivatives")
471516
end
472517

473518
# Note we set shrine S3 storage to public, to upload with public ACLs
474519
def self.shrine_on_demand_derivatives_storage
475520
@shrine_on_demand_derivatives_storage ||=
476521
appropriate_shrine_storage( bucket_key: :s3_bucket_on_demand_derivatives,
522+
host: lookup(:s3_bucket_on_demand_derivatives_host),
523+
public: true,
477524
s3_storage_options: {
478-
public: true
479-
})
525+
upload_options: {
526+
# these have fingerprints in their URLs, so they are
527+
# cacheable forever. only started setting this in Jul 2024
528+
cache_control: "max-age=31536000, public"
529+
}
530+
}
531+
)
480532
end
481533

482534
def self.shrine_combined_audio_derivatives_storage
@@ -485,16 +537,22 @@ def self.shrine_combined_audio_derivatives_storage
485537
appropriate_shrine_storage( bucket_key: :s3_bucket_derivatives,
486538
host: lookup(:s3_bucket_derivatives_host),
487539
prefix: "combined_audio_derivatives",
540+
public: true,
488541
s3_storage_options: {
489-
public: true
542+
upload_options: {
543+
# these have fingerprints in their URLs, so they are
544+
# cacheable forever. only started setting this in Jul 2024
545+
cache_control: "max-age=31536000, public"
546+
}
490547
})
491548
end
492549

493550
def self.shrine_dzi_storage
494551
@shrine_dzi_storage ||=
495552
appropriate_shrine_storage( bucket_key: :s3_bucket_dzi,
553+
host: lookup(:s3_bucket_dzi_host),
554+
public: true,
496555
s3_storage_options: {
497-
public: true,
498556
upload_options: {
499557
# our DZI's are all public right now, and at unique-to-content
500558
# URLs, cache forever.
Lines changed: 125 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,125 @@
1+
require "shrine/storage/s3"
2+
require 'faster_s3_url/shrine/storage'
3+
require "aws-sdk-cloudfront"
4+
5+
6+
# A shrine storage for use with AWS S3, and sub-classing Shrine's S3 storage -- but will generate
7+
# access URLs assuming CloudFront CDN in front of S3, according to our conventions.
8+
#
9+
# A `host` is required on initialization -- the cloudfront distro hostname
10+
# `public` is set on initialization and can't be overridden in #url; if not public, CloudFront signing will be used.
11+
#
12+
# The Cloudfront key pair id and private key are generally supplied from Env.
13+
#
14+
# https://shrinerb.com/docs/storage/s3
15+
#
16+
# For the cloudfront_key_pair_id and cloudfront_private_key, see https://docs.aws.amazon.com/AmazonCloudFront/latest/DeveloperGuide/private-content-trusted-signers.html#private-content-creating-cloudfront-key-pairs
17+
#
18+
# For general write-up of how we've set things up, see https://bibwild.wordpress.com/2024/06/18/cloudfront-in-front-of-s3-using-response-content-disposition/
19+
module ScihistDigicoll
  module ShrineStorage
    # Shrine storage sub-classing Shrine's S3 storage, but generating access URLs
    # against a CloudFront distribution hostname instead of the bucket itself.
    #
    # In public mode URLs are plain CloudFront URLs; otherwise every URL is signed
    # with Aws::CloudFront::UrlSigner. The mode is fixed at initialization.
    class CloudfrontS3Storage < ::Shrine::Storage::S3
      # host:              CloudFront distro hostname given to the initializer
      # public_mode:       true if URLs are generated unsigned
      # cloudfront_signer: Aws::CloudFront::UrlSigner, only built when not in public mode
      # key_pair_id:       the `cloudfront_key_pair_id:` given to the initializer (may be nil)
      attr_reader :host, :public_mode, :cloudfront_signer, :key_pair_id

      # Options accepted by #url (AWS ruby SDK style names) that are appended to
      # the generated URL as query parameters, mapped to the query parameter name.
      QUERY_PARAMS_PROXIED = {
        response_cache_control: "response-cache-control",
        response_content_disposition: "response-content-disposition",
        response_content_encoding: "response-content-encoding",
        response_content_language: "response-content-language",
        response_content_type: "response-content-type"
      }.freeze

      # Lifetime of a signed URL when the caller supplies neither `expires:` nor `expires_in:`.
      DEFAULT_EXPIRES_IN = 1.day

      # Generally any option that Shrine::Storage::S3 respects can be used, and is passed through,
      # except `signer`, which is not supported and will raise -- we always use CloudFront
      # signing if public: false.
      #
      # @param host [String] set only in initializer, the Cloudfront distro host
      #
      # @param public [Boolean] (default true), if false, then Cloudfront signing will be done to urls; can not be
      #   overridden in #url, since the way we set up CloudFront distros they either require signing of all urls or none.
      #
      # @param cloudfront_key_pair_id [String] required if public:false, the cloudfront key pair id used for signing
      #
      # @param cloudfront_private_key [String] required if public:false, RSA private key corresponding
      #   to cloudfront_key_pair_id
      #
      # @raise [ArgumentError] if `signer:` is passed, or if public is false and either
      #   cloudfront_key_pair_id or cloudfront_private_key is missing
      #
      # NOTE: You will still need keys `access_key_id:` and `secret_access_key:` (or otherwise have AWS credentials
      # auto-discoverable), because bucket edit operations etc require them!
      def initialize(host:, public: true, cloudfront_key_pair_id: nil, cloudfront_private_key: nil, **options)
        if options[:signer]
          raise ArgumentError, "#{self.class.name} does not support :signer option of Shrine::Storage::S3."
        end

        @public_mode = !!public
        @host = host
        # Back the declared `key_pair_id` reader, which was previously never assigned.
        @key_pair_id = cloudfront_key_pair_id

        # Used only to build the unsigned host + key URL that everything else decorates.
        @public_builder = FasterS3Url::Shrine::Storage.new(public: true, host: host, **options)

        unless public_mode
          @cloudfront_signer = Aws::CloudFront::UrlSigner.new(
            key_pair_id: cloudfront_key_pair_id ||
              raise(ArgumentError, "option `cloudfront_key_pair_id:` is required when `public:` is false"),
            private_key: cloudfront_private_key ||
              raise(ArgumentError, "option `cloudfront_private_key:` is required when `public:` is false")
          )
        end

        # We always tell the underlying shrine s3 storage we are NOT public, because we assume
        # the underlying S3 bucket is not public; in case it matters, we need to be signing requests
        # etc.
        super(public: false, **options)
      end

      # Unlike base Shrine::Storage::S3, does not support `host` here, do it in
      # initializer instead.
      #
      # We IGNORE `public` option here -- our Cloudfront is either public or does not support
      # public, no way to change per-url. But we ignore rather than raise, to allow
      # swap-in compatibility with code expecting to send it for normal S3.
      #
      # Unlike Shrine::Storage::S3, recognized S3 options (AWS ruby SDK style, see
      # QUERY_PARAMS_PROXIED) *are* passed on in public mode too, because in some cases
      # we have set up cloudfront to proxy them.
      #
      # Otherwise, same options as Shrine::Storage::S3 should be supported, please
      # see docs there. https://shrinerb.com/docs/storage/s3
      #
      # @param id [String] storage key of the file
      # @return [String] public or CloudFront-signed URL, depending on public_mode
      # @raise [ArgumentError] if `host:` is passed
      def url(id, **options)
        if options[:host]
          raise ArgumentError, "#{self.class.name}#url does not support :host option of Shrine::Storage::S3. You can only set host in initializer"
        end

        if public_mode
          public_url(id, **options)
        else
          signed_url(id, **options)
        end
      end

      # Unsigned URL on the CloudFront host, with any QUERY_PARAMS_PROXIED options
      # appended as a query string.
      def public_url(key, **options)
        "#{@public_builder.url(key)}#{query_param_serialized(options)}"
      end

      # CloudFront-signed variant of #public_url.
      #
      # @param expires_in [#to_i] seconds from now until expiry; only used when
      #   `expires:` is not given in options
      def signed_url(key, expires_in: DEFAULT_EXPIRES_IN, **options)
        expires = options[:expires]&.to_i || (Time.now.utc.to_i + expires_in.to_i)

        # All remaining options are forwarded to the signer as well; a caller-supplied
        # `expires:` therefore reaches the signer in its original form, because the
        # later `**options` key wins over the computed one.
        cloudfront_signer.signed_url(public_url(key, **options),
          expires: expires,
          **options
        )
      end

      protected

      # Builds the "?name=value&..." suffix for the proxied response-* options.
      #
      # Returns nil when there is nothing to append, including when every recognized
      # option has a nil value, so the URL never ends in a dangling "?".
      def query_param_serialized(options)
        return nil if options.blank?

        params = options.each_with_object({}) do |(key, value), collected|
          next if value.nil? || !QUERY_PARAMS_PROXIED.key?(key)

          collected[QUERY_PARAMS_PROXIED[key]] = value
        end

        return nil if params.empty?

        "?#{params.to_param}"
      end
    end
  end
end

0 commit comments

Comments
 (0)