
Commit f3ce05f

Merge #2 SMProcessing for image preprocessing
Refactor PDF splitting and image pre-processing from an in-notebook script to a SageMaker Processing Job for better scalability. Use the sm-docker build CLI to build the required custom container from the notebook. Fixes #1
2 parents 22f954e + 4bee119 commit f3ce05f
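
For orientation only (not part of this commit): the Processing Job that this refactor moves the PDF splitting and image pre-processing into would typically be launched from the notebook along the lines below. This is a hedged sketch using the SageMaker Python SDK; the entry-point script, S3 paths, and instance type are placeholders rather than values from this repository, and it assumes the custom sm-scikit-ocrtools image (built with sm-docker, see the sketch after the diffs) is already in ECR.

# Hedged sketch (not from this repository): launching the pre-processing as a
# SageMaker Processing Job with the custom container. Script name, S3 paths and
# instance type are placeholders; the image is assumed to already exist in ECR.
import sagemaker
from sagemaker.processing import ProcessingInput, ProcessingOutput, ScriptProcessor

role = sagemaker.get_execution_role()

processor = ScriptProcessor(
    image_uri="<account>.dkr.ecr.<region>.amazonaws.com/sm-scikit-ocrtools:latest",
    command=["python3"],
    role=role,
    instance_type="ml.m5.xlarge",
    instance_count=1,
)

# Split raw PDFs into page images in a scalable, off-notebook job:
processor.run(
    code="preprocessing/split_pdfs.py",  # hypothetical entry-point script
    inputs=[
        ProcessingInput(
            source="s3://<input-bucket>/raw-pdfs/",
            destination="/opt/ml/processing/input",
        ),
    ],
    outputs=[
        ProcessingOutput(
            source="/opt/ml/processing/output",
            destination="s3://<input-bucket>/page-images/",
        ),
    ],
)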

File tree

10 files changed: +1353 additions, −826 deletions


annotation/__init__.py

Lines changed: 162 additions & 0 deletions
@@ -7,11 +7,15 @@
 """
 # Python Built-Ins:
 import os
+from typing import List
 
 # External Dependencies:
 from aws_cdk import core as cdk
 from aws_cdk.aws_iam import (
+    Effect,
     ManagedPolicy,
+    PolicyDocument,
+    PolicyStatement,
     Role,
     ServicePrincipal,
 )
@@ -30,6 +34,138 @@ class AnnotationInfra(cdk.Construct):
     def __init__(self, scope: cdk.Construct, id: str, **kwargs):
         super().__init__(scope, id, **kwargs)
 
+        self.sm_image_build_role = Role(
+            self,
+            "SMImageBuildRole",
+            assumed_by=ServicePrincipal("codebuild.amazonaws.com"),
+            description=(
+                "CodeBuild Role for data scientist to build ECR containers for OCR preprocessing"
+            ),
+            inline_policies={
+                "OCRPipelineImageBuild": PolicyDocument(
+                    # Scoped down from permissions defined by the sagemaker-studio-image-build-cli:
+                    # https://github.com/aws-samples/sagemaker-studio-image-build-cli
+                    statements=[
+                        PolicyStatement(
+                            sid="CreateCodeBuildLogStreams",
+                            actions=["logs:CreateLogStream"],
+                            effect=Effect.ALLOW,
+                            resources=[
+                                "arn:aws:logs:*:*:log-group:/aws/codebuild/sagemaker-studio*",
+                            ],
+                        ),
+                        PolicyStatement(
+                            sid="CreateLogGroups",
+                            actions=["logs:CreateLogGroup"],
+                            effect=Effect.ALLOW,
+                            resources=["*"],
+                        ),
+                        PolicyStatement(
+                            sid="CodeBuildLogEvents",
+                            actions=[
+                                "logs:GetLogEvents",
+                                "logs:PutLogEvents",
+                            ],
+                            effect=Effect.ALLOW,
+                            resources=[
+                                "arn:aws:logs:*:*:log-group:/aws/codebuild/sagemaker-studio*:log-stream:*",
+                            ],
+                        ),
+                        PolicyStatement(
+                            sid="ECRLogInToken",
+                            actions=["ecr:GetAuthorizationToken"],
+                            effect=Effect.ALLOW,
+                            resources=["*"],
+                        ),
+                        PolicyStatement(
+                            sid="ECRReadWrite",
+                            actions=[
+                                "ecr:CreateRepository",
+                                "ecr:BatchGetImage",
+                                "ecr:CompleteLayerUpload",
+                                "ecr:DescribeImages",
+                                "ecr:DescribeRepositories",
+                                "ecr:UploadLayerPart",
+                                "ecr:ListImages",
+                                "ecr:InitiateLayerUpload",
+                                "ecr:BatchCheckLayerAvailability",
+                                "ecr:PutImage",
+                            ],
+                            effect=Effect.ALLOW,
+                            resources=[
+                                # We'll only allow a specific repo name, rather than any default:
+                                # "arn:aws:ecr:*:*:repository/sagemaker-studio*",
+                                "arn:aws:ecr:*:*:repository/sm-scikit-ocrtools",
+                            ],
+                        ),
+                        PolicyStatement(
+                            sid="AccessPreBuiltAWSImages",
+                            actions=[
+                                "ecr:BatchGetImage",
+                                "ecr:GetDownloadUrlForLayer",
+                            ],
+                            effect=Effect.ALLOW,
+                            resources=[
+                                "arn:aws:ecr:*:121021644041:repository/*",
+                                "arn:aws:ecr:*:763104351884:repository/*",
+                                "arn:aws:ecr:*:217643126080:repository/*",
+                                "arn:aws:ecr:*:727897471807:repository/*",
+                                "arn:aws:ecr:*:626614931356:repository/*",
+                                "arn:aws:ecr:*:683313688378:repository/*",
+                                "arn:aws:ecr:*:520713654638:repository/*",
+                                "arn:aws:ecr:*:462105765813:repository/*",
+                            ],
+                        ),
+                        PolicyStatement(
+                            sid="BundleCodeToS3",
+                            actions=[
+                                "s3:GetObject",
+                                "s3:DeleteObject",
+                                "s3:PutObject",
+                            ],
+                            effect=Effect.ALLOW,
+                            resources=[
+                                # Tightened this up a bit vs the default:
+                                # "arn:aws:s3:::sagemaker-*/*"
+                                "arn:aws:s3:::sagemaker-*/codebuild-sagemaker-container-*"
+                            ],
+                        ),
+                        # Omit this one because the user should have it already per our guidance,
+                        # and if they don't already it's probably best to fail than quietly grant:
+                        # PolicyStatement(
+                        #     sid="CreateSageMakerDefaultBucketIfMissing",
+                        #     actions=["s3:CreateBucket"],
+                        #     effect=Effect.ALLOW,
+                        #     resources=["arn:aws:s3:::sagemaker*"],
+                        # ),
+                        # Only required if not explicitly passing a --role (which we will):
+                        # PolicyStatement(
+                        #     sid="LookUpIAMRoles",
+                        #     actions=["iam:GetRole", "iam:ListRoles"],
+                        #     effect=Effect.ALLOW,
+                        #     resources=["*"],
+                        # ),
+                        # Only required if building within VPCs (which we won't):
+                        # PolicyStatement(
+                        #     sid="VPCAccess",
+                        #     actions=[
+                        #         "ec2:CreateNetworkInterface",
+                        #         "ec2:CreateNetworkInterfacePermission",
+                        #         "ec2:DescribeDhcpOptions",
+                        #         "ec2:DescribeNetworkInterfaces",
+                        #         "ec2:DeleteNetworkInterface",
+                        #         "ec2:DescribeSubnets",
+                        #         "ec2:DescribeSecurityGroups",
+                        #         "ec2:DescribeVpcs"
+                        #     ],
+                        #     effect=Effect.ALLOW,
+                        #     resources=["*"],
+                        # ),
+                    ],
+                ),
+            },
+        )
+
         self.lambda_role = Role(
             self,
             "SMGT-LambdaRole",
@@ -83,3 +219,29 @@ def pre_lambda(self):
     @property
     def post_lambda(self):
         return self._post_lambda
+
+    def get_data_science_policy_statements(self) -> List[PolicyStatement]:
+        """Generate policy statements required for data scientist to use the annotation infra"""
+        return [
+            PolicyStatement(
+                sid="PassSMImageBuildRole",
+                actions=["iam:PassRole"],
+                resources=[self.sm_image_build_role.role_arn],
+                conditions={
+                    "StringLikeIfExists": {
+                        "iam:PassedToService": "codebuild.amazonaws.com",
+                    },
+                },
+            ),
+            PolicyStatement(
+                sid="EditSMStudioCodeBuildProjects",
+                actions=[
+                    "codebuild:DeleteProject",
+                    "codebuild:CreateProject",
+                    "codebuild:BatchGetBuilds",
+                    "codebuild:StartBuild",
+                ],
+                effect=Effect.ALLOW,
+                resources=["arn:aws:codebuild:*:*:project/sagemaker-studio*"],
+            ),
+        ]

cdk_demo_stack.py

Lines changed: 12 additions & 0 deletions
@@ -106,6 +106,7 @@ def __init__(
                 sid="RunPlainTextractStateMachine",
             ),
         ]
+        + self.annotation_infra.get_data_science_policy_statements()
         + self.pipeline.config_read_write_statements()
         # In the notebooks we'll use the same execution role for the trained model/endpoint
         # as the notebook itself runs with - so need to grant the role the required perms
@@ -191,6 +192,16 @@ def __init__(
         # able to automatically look up project resources from SageMaker notebooks. To support
         # this, we'll create additional SSM params used just to *retrieve* static attributes of the
         # stack - rather than configuration points like the ProcessingPipeline construct's params.
+        self.sm_image_build_role_ssm_param = ssm.StringParameter(
+            self,
+            "SMImageBuildRoleSSMParam",
+            string_value=self.annotation_infra.sm_image_build_role.role_name,
+            description=(
+                "Name of the CodeBuild execution role to use in SMStudio Image Build commands"
+            ),
+            parameter_name=f"/{self.project_id_param.value_as_string}/static/SMDockerBuildRole",
+            simple_name=False,
+        )
         self.input_bucket_ssm_param = ssm.StringParameter(
             self,
             "InputBucketNameSSMParam",
@@ -239,6 +250,7 @@ def __init__(
         self.data_science_policy.add_statements(
             SsmParameterReadStatement(
                 resources=[
+                    self.sm_image_build_role_ssm_param,
                     self.input_bucket_ssm_param,
                     self.reviews_bucket_ssm_param,
                     self.pipeline_statemachine_ssm_param,

0 commit comments
