Skip to content

Commit c187993

Browse files
authored
Merge pull request #1 from aws-samples/feature/initial
Feature/initial
2 parents 30542aa + ea46d8c commit c187993

File tree

16 files changed

+526
-6
lines changed

16 files changed

+526
-6
lines changed

.gitignore

Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,12 @@
1+
*.swp
2+
package-lock.json
3+
__pycache__
4+
.pytest_cache
5+
.venv
6+
.env
7+
*.egg-info
8+
.DS_Store
9+
10+
# CDK asset staging directory
11+
.cdk.staging
12+
cdk.out

README.md

Lines changed: 2 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -1,11 +1,7 @@
1-
## My Project
1+
## Simplify and Optimize Python Package Management for AWS Glue PySpark jobs with AWS CodeArtifact
22

3-
TODO: Fill this README out!
3+
The artifacts in this repository support the published blog: Simplify and Optimize Python Package Management for AWS Glue PySpark jobs with AWS CodeArtifact. Refer to the blog for detailed instructions on setup and configuration.
44

5-
Be sure to:
6-
7-
* Change the title in this README
8-
* Edit your repository description on GitHub
95

106
## Security
117

app.py

Lines changed: 16 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,16 @@
1+
#!/usr/bin/env python3
2+
import os
3+
4+
from aws_cdk import (
5+
Stack,
6+
App,
7+
Environment
8+
)
9+
from application.application_stack import ApplicationStack
10+
11+
# CDK entry point: build the app, attach the single application stack,
# and synthesize the cloud assembly.
app = App()

ApplicationStack(
    app,
    "ApplicationStack",
    cidr_block='192.168.50.0/24',
)

app.synth()
16+

application/__init__.py

Whitespace-only changes.

application/application_stack.py

Lines changed: 327 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,327 @@
1+
from aws_cdk import (
2+
aws_ec2 as ec2,
3+
aws_ssm as ssm,
4+
aws_codeartifact as codeartifact,
5+
aws_stepfunctions as sfn,
6+
aws_glue_alpha as glue,
7+
aws_iam as iam,
8+
aws_s3 as s3,
9+
aws_logs as logs,
10+
aws_s3_deployment as s3_deployment,
11+
Aspects,Stack,RemovalPolicy,Aws,Duration,CfnOutput
12+
13+
)
14+
from constructs import Construct
15+
16+
import json,os
17+
18+
from cdk_nag import ( AwsSolutionsChecks, NagSuppressions )
19+
20+
class ApplicationStack(Stack):
    """CDK stack wiring a CodeArtifact-backed Glue PySpark job.

    The stack creates, in order: an isolated VPC with flow logs, an S3
    bucket seeded from ``./scripts/s3``, interface endpoints for
    CodeArtifact and Glue, a CodeArtifact domain with two repositories,
    a Glue connection/database/job, and a Step Functions state machine
    whose definition is loaded from ``./scripts/statemachine/sfn.json``.
    """

    def create_pypi_repo(self):
        """Create the CodeArtifact repository connected to public PyPI.

        Reads ``self.pypi_repo_name`` and ``self.domain_name``, so both
        must be set before this is called.

        Returns:
            The ``codeartifact.CfnRepository`` construct.
        """
        return codeartifact.CfnRepository(
            self,
            id=self.pypi_repo_name,
            domain_name=self.domain_name,
            repository_name=self.pypi_repo_name,
            external_connections=["public:pypi"],
            description="Provides PyPI artifacts from PyPA.",
        )

    def create_code_repo(self):
        """Create the internal repository with the PyPI repo as upstream.

        Reads ``self.repo_name``, ``self.domain_name`` and
        ``self.pypi_repo_name``, so all three must be set before this is
        called.

        Returns:
            The ``codeartifact.CfnRepository`` construct.
        """
        return codeartifact.CfnRepository(
            self,
            id=self.repo_name,
            domain_name=self.domain_name,
            repository_name=self.repo_name,
            upstreams=[self.pypi_repo_name],
            description="Internal python package repository.",
        )

    def __init__(self, scope: Construct, construct_id: str, cidr_block: str, **kwargs) -> None:
        """Define every resource in the stack.

        Args:
            scope: Parent construct (normally the CDK ``App``).
            construct_id: Logical id of this stack.
            cidr_block: IPv4 CIDR range for the VPC.
            **kwargs: Forwarded unchanged to ``Stack.__init__``.

        Raises:
            KeyError: If the ``CDK_DEFAULT_ACCOUNT`` environment variable
                is not set.
        """
        super().__init__(scope, construct_id, **kwargs)

        ############################################
        ##
        ## CDK Nag - https://pypi.org/project/cdk-nag/
        ##           https://github.com/cdklabs/cdk-nag
        ##
        ## Only the AwsSolutions rule pack is enabled here:
        ##   https://github.com/cdklabs/cdk-nag/blob/main/RULES.md#awssolutions
        ## No other rule pack (e.g. HIPAA Security, NIST 800-53 rev 4 /
        ## rev 5) is added to this stack.
        ##
        ############################################
        Aspects.of(self).add(AwsSolutionsChecks())

        ## Suppressed findings (each one is an open TODO).
        NagSuppressions.add_stack_suppressions(self, [
            {"id": "AwsSolutions-S1", "reason": "TODO: Set *server_access_logs_bucket* and *server_access_logs_prefix* to enable server access logging."},
            {"id": "AwsSolutions-IAM4", "reason": "TODO: Stop using AWS managed policies."},
            {"id": "AwsSolutions-IAM5", "reason": "TODO: Remove Wildcards in IAM roles."},
            {"id": "AwsSolutions-SF2", "reason": "TODO: Set the X-Ray Tracing on the Step Function."},
            {"id": "AwsSolutions-SF1", "reason": "TODO: Set the Step Function CloudWatch Logs log events to 'ALL' "},
        ])

        ## Variable Initialization
        # Read from the environment (not Aws.ACCOUNT_ID) because the value is
        # sliced below to build the bucket name.
        cdk_account_id: str = os.environ["CDK_DEFAULT_ACCOUNT"]

        ########################################
        ##
        ## VPC
        ##
        #########################################
        # NOTE(review): `cidr` is deprecated in newer aws-cdk-lib releases in
        # favour of `ip_addresses` - confirm the pinned version before switching.
        self.vpc = ec2.Vpc(
            self,
            'enterprise-repo-vpc',
            gateway_endpoints={
                "S3": ec2.GatewayVpcEndpointOptions(
                    service=ec2.GatewayVpcEndpointAwsService.S3
                )
            },
            vpc_name='enterprise-repo-vpc',
            cidr=cidr_block,
            max_azs=1,
            enable_dns_hostnames=True,
            enable_dns_support=True,
            subnet_configuration=[
                ec2.SubnetConfiguration(
                    name='Enterprise-Repo-Private-',
                    subnet_type=ec2.SubnetType.PRIVATE_ISOLATED,
                    cidr_mask=26,
                )
            ],
        )

        # The VPC only has PRIVATE_ISOLATED subnets, which CDK exposes through
        # `isolated_subnets`; `private_subnets` would be empty here and no
        # parameter would be published.
        for index, subnet in enumerate(self.vpc.isolated_subnets, start=1):
            ssm.StringParameter(
                self,
                f'enterprise-repo-private-subnet-{index}',
                string_value=subnet.subnet_id,
                parameter_name=f'/enterprise-repo/private-subnet-{index}',
            )

        # VPC flow logs delivered to a CloudWatch Logs group.
        log_group = logs.LogGroup(self, "enterprise-repo-log-group")
        flow_log_role = iam.Role(
            self,
            "enterprise-repo-vpc-flow-log-role",
            assumed_by=iam.ServicePrincipal("vpc-flow-logs.amazonaws.com"),
        )
        ec2.FlowLog(
            self,
            "enterprise-repo-vpc-flow-log",
            resource_type=ec2.FlowLogResourceType.from_vpc(self.vpc),
            destination=ec2.FlowLogDestination.to_cloud_watch_logs(log_group, flow_log_role),
        )

        ########################################
        ##
        ## S3 Bucket
        ##
        #########################################
        # Bucket name: last five characters of the account id plus the region.
        bucket = s3.Bucket(
            self,
            "enterprise-repo-bucket",
            bucket_name=f"codeartifactblog-{cdk_account_id[-5:]}-{Aws.REGION}",
            auto_delete_objects=True,
            removal_policy=RemovalPolicy.DESTROY,
            block_public_access=s3.BlockPublicAccess.BLOCK_ALL,
            encryption=s3.BucketEncryption.S3_MANAGED,
        )

        # Upload the local ./scripts/s3 directory under the "data" prefix.
        s3_deployment.BucketDeployment(
            self,
            "enterprise-repo-bucket-deployment",
            sources=[s3_deployment.Source.asset("./scripts/s3")],
            destination_bucket=bucket,
            destination_key_prefix="data",
        )

        ########################################
        ##
        ## Code Artifact VPC Interface Endpoints
        ##
        #########################################
        # `subnets` takes a SubnetSelection, not a bare SubnetType.
        isolated_subnets = ec2.SubnetSelection(subnet_type=ec2.SubnetType.PRIVATE_ISOLATED)

        self.vpc.add_interface_endpoint(
            "CodeArtifactEndPoint",
            service=ec2.InterfaceVpcEndpointService(f'com.amazonaws.{Aws.REGION}.codeartifact.api'),
            subnets=isolated_subnets,
        )

        self.vpc.add_interface_endpoint(
            "CodeArtifactRepositoriesEndPoint",
            service=ec2.InterfaceVpcEndpointService(f'com.amazonaws.{Aws.REGION}.codeartifact.repositories'),
            subnets=isolated_subnets,
            private_dns_enabled=True,
        )

        self.vpc.add_interface_endpoint(
            "GlueRepositoriesEndPoint",
            service=ec2.InterfaceVpcEndpointService(f'com.amazonaws.{Aws.REGION}.glue'),
            subnets=isolated_subnets,
            private_dns_enabled=True,
        )

        ########################################
        ##
        ## Code Artifact Domain and Repository Creation
        ##
        #########################################
        # Names of the domain, the PyPI mirror repository, and the internal
        # repository. The create_*_repo helpers read these attributes.
        self.domain_name = 'enterprise-repo-domain'
        self.pypi_repo_name = "pypi-store"
        self.repo_name = "enterprise-repo"
        self.domain = codeartifact.CfnDomain(self, "cfndomain", domain_name=self.domain_name)

        self.pypi_repo = self.create_pypi_repo()
        self.code_repo = self.create_code_repo()

        # Explicit ordering: domain -> PyPI repo -> internal repo.
        # NOTE(review): `add_depends_on` is deprecated in newer aws-cdk-lib
        # releases in favour of `add_dependency` - confirm the pinned version.
        self.pypi_repo.add_depends_on(self.domain)
        self.code_repo.add_depends_on(self.pypi_repo)

        # "{{}}" renders a literal "{}" that States.Format fills with the
        # authorization token when the state machine runs (see below).
        code_artifact_url = f"https://aws:{{}}@{self.domain_name}-{Aws.ACCOUNT_ID}.d.codeartifact.{Aws.REGION}.amazonaws.com/pypi/{self.repo_name}/simple/"

        ########################################
        ##
        ## Glue Connection
        ##
        #########################################
        self.sg_glue_conn = ec2.SecurityGroup(
            self,
            id='sg_demo_glue_conn',
            vpc=self.vpc,
            allow_all_outbound=True,
            description='Security Group for Glue Connection',
        )
        # Self-referencing rule: members of the group may reach each other
        # on all ports.
        self.sg_glue_conn.add_ingress_rule(
            peer=self.sg_glue_conn,
            connection=ec2.Port.all_traffic(),
        )

        ####################################
        ##
        ## Glue Job Role Policy
        ##
        ####################################
        glue_job_role_iam_policy = iam.ManagedPolicy(
            self,
            "GlueJobIamPolicy",
            managed_policy_name='enterprise-repo-glue-job-policy',
            description="Glue Job IAM Policy",
        )

        glue_job_role_iam_policy.add_statements(iam.PolicyStatement(
            effect=iam.Effect.ALLOW,
            actions=["s3:*"],
            resources=[f"{bucket.bucket_arn}/*", bucket.bucket_arn],
        ))

        glue_job_role_iam_policy.add_statements(iam.PolicyStatement(
            effect=iam.Effect.ALLOW,
            actions=["iam:PassRole"],
            resources=['*'],
            conditions={
                'StringLike': {
                    "iam:PassedToService": ["glue.amazonaws.com"]
                }
            },
        ))

        self.glue_job_role = iam.Role(
            self,
            id="glue_job_role",
            role_name="enterprise_repo_glue_job_role",
            assumed_by=iam.ServicePrincipal("glue.amazonaws.com"),
            path="/service-role/",
        )
        self.glue_job_role.add_managed_policy(glue_job_role_iam_policy)
        self.glue_job_role.add_managed_policy(
            iam.ManagedPolicy.from_aws_managed_policy_name("service-role/AWSGlueServiceRole")
        )

        ########################################
        ##
        ## Glue Database
        ##
        #########################################
        glue_database = glue.Database(
            self,
            id='enterprise-repo-glue-db',
            database_name='codeartifactblog_glue_db',
        )

        ########################################
        ##
        ## Glue Spark
        ##
        #########################################
        self.glue_conn = glue.Connection(
            self,
            id='enterprise_repo_glue_conn',
            type=glue.ConnectionType.NETWORK,
            connection_name='enterprise-repo-glue-connection',
            security_groups=[self.sg_glue_conn],
            subnet=self.vpc.select_subnets(subnet_type=ec2.SubnetType.PRIVATE_ISOLATED).subnets[0],
        )

        glue_job = glue.Job(
            self,
            "enterprise_repo_spark_etl_job",
            executable=glue.JobExecutable.python_etl(
                glue_version=glue.GlueVersion.V3_0,
                python_version=glue.PythonVersion.THREE,
                script=glue.Code.from_asset("./scripts/glue/job.py"),
            ),
            connections=[self.glue_conn],
            role=self.glue_job_role,
            worker_count=3,
            job_name='enterprise-repo-glue-job',
            worker_type=glue.WorkerType.G_1_X,
            continuous_logging=glue.ContinuousLoggingProps(enabled=True),
            max_retries=0,
            enable_profiling_metrics=True,
            timeout=Duration.minutes(20),
            default_arguments={
                '--additional-python-modules': 'awswrangler',
                '--class': 'GlueApp',
                '--S3_BUCKET': bucket.bucket_name,
                '--GLUE_DATABASE': glue_database.database_name,
                '--python-modules-installer-option': '',
            },
            description="an example Python ETL job",
        )

        ####################################
        ##
        ## State Machine Execution Role Policy
        ##
        ####################################
        sfn_execution_role_iam_policy = iam.ManagedPolicy(
            self,
            "enterprise_repo_sfn_iam_policy",
            managed_policy_name='enterprise-repo-sfn-policy',
            description="SFN IAM Policy",
        )

        sfn_execution_role_iam_policy.add_statements(iam.PolicyStatement(
            effect=iam.Effect.ALLOW,
            actions=["s3:PutObject", "s3:GetObject"],
            resources=[f"{bucket.bucket_arn}/*"],
        ))

        ########################################
        ##
        ## State Machine
        ##
        #########################################
        # Path is relative to the working directory the CDK app is run from.
        with open('./scripts/statemachine/sfn.json', encoding="utf-8") as definition_file:
            json_definition = json.load(definition_file)

        # Inject the repository URL into the GenerateCodeArtifactURL state.
        json_definition["States"]["GenerateCodeArtifactURL"]["Parameters"]["codeartifacturl.$"] = (
            f"States.Format('--index-url={code_artifact_url}', $.taskresult.AuthorizationToken)"
        )
        definition = json.dumps(json_definition, indent=4)

        self.sfn_role = iam.Role(
            self,
            id="sfn_role",
            role_name="enterprise_repo_sfn_role",
            assumed_by=iam.ServicePrincipal("states.amazonaws.com"),
            path="/service-role/",
        )
        self.sfn_role.add_managed_policy(sfn_execution_role_iam_policy)
        for aws_policy_name in (
            "AmazonS3ReadOnlyAccess",
            "AWSCodeArtifactReadOnlyAccess",
            "service-role/AWSGlueServiceRole",
        ):
            self.sfn_role.add_managed_policy(
                iam.ManagedPolicy.from_aws_managed_policy_name(aws_policy_name)
            )

        sfn.CfnStateMachine(
            self,
            "enterprise_repo_state_machine",
            role_arn=self.sfn_role.role_arn,
            state_machine_name='enterprise-repo-step-function',
            definition_string=definition,
            definition_substitutions={
                "domain": self.domain_name,
                "aws_account_id": Aws.ACCOUNT_ID,
                "jobname": glue_job.job_name,
            },
        )

        ####################################
        ##
        ## Cfn Output
        ##
        ####################################
        CfnOutput(
            self,
            "Repository_Name",
            value=self.repo_name,
            description="Code Artifact Repository Name",
        )
        CfnOutput(
            self,
            "Domain_Name",
            value=self.domain_name,
            description="Code Artifact Domain name for Repository",
        )

0 commit comments

Comments
 (0)