AWS CloudFormationでMLflowサーバを構築した

AWS CloudFormationでMLflowサーバを構築した。MLflowサーバのバックエンドには、S3(アーティファクトの保存先)とRDS(バックエンドストア)を使用している。

AWS CloudFormationで以下のYAMLテンプレートからスタックを作成すれば、VPCやサブネット、IAMロール、MLflowがインストールされたEC2インスタンスなどが自動的に構築される。VPCのCIDRが既存のネットワークと重複していたりすると構築に失敗するので、その場合はParametersのデフォルト値(VpcCidrBlockや各サブネットのCIDR、MLFlowPrivateIpAddressなど)を重複しない値に編集して使用する。

AWSTemplateFormatVersion: '2010-09-09'
# Stack inputs. Every parameter carries a default, so the stack can be created
# without overrides; change the CIDR values when they clash with an existing
# network.
Parameters:
  # --- Network layout -------------------------------------------------------
  VpcCidrBlock:
    Type: String
    Description: CIDR block for VPC
    Default: 192.168.0.0/16
  PublicSubnetCidrBlock1:
    Type: String
    Description: CIDR block for subnet
    Default: 192.168.1.0/24
  PublicSubnetCidrBlock2:
    Type: String
    Description: CIDR block for subnet
    Default: 192.168.2.0/24
  LocalSubnetCidrBlock1:
    Type: String
    Description: CIDR block for subnet
    Default: 192.168.11.0/24
  LocalSubnetCidrBlock2:
    Type: String
    Description: CIDR block for subnet
    Default: 192.168.12.0/24
  # The MLflow instance is placed in PublicSubnet1 with this fixed address, so
  # the value has to fall inside PublicSubnetCidrBlock1.
  MLFlowPrivateIpAddress:
    Type: String
    Description: MLFlow Private IP Address
    Default: 192.168.1.10

  # --- Compute / database / storage -----------------------------------------
  KeyName:
    Type: AWS::EC2::KeyPair::KeyName
    Description: EC2 Key Pair for SSH access
    Default: MyKeyPair
  # Shared by both EC2 instances in this template.
  InstanceType:
    Type: String
    Description: EC2 instance type
    Default: t2.micro
  RDSInstanceClass:
    Type: String
    Description: RDS instance class
    Default: db.t2.micro
  DBName:
    Type: String
    Description: RDS database name
    Default: mlflowdb
  # Used verbatim as the bucket name. S3 bucket names share one global
  # namespace, so the default will usually need to be overridden.
  S3Name:
    Type: String
    Description: S3 name
    Default: mlflows300
  MasterUsername:
    Type: String
    Description: RDS master username
    Default: mlflowuser
  # The password is interpolated, unescaped, into a double-quoted shell string
  # and a database URI in the MLflow instance's UserData. The constraints below
  # reject values that RDS for MySQL would refuse (length) or that would break
  # that shell/URI context (quotes, $, @, /, :, whitespace, ...) at parameter
  # validation time instead of half-way through stack creation.
  # NOTE(review): a literal default password defeats the purpose of NoEcho;
  # override it for anything beyond a throw-away environment.
  MasterUserPassword:
    Type: String
    Description: RDS master user password
    NoEcho: true
    MinLength: 8
    MaxLength: 41
    AllowedPattern: '[A-Za-z0-9._~!*+,=-]+'
    ConstraintDescription: >-
      must be 8-41 characters long and contain only letters, digits and the
      characters . _ ~ ! * + , = -
    Default: mlflowpassword

Resources:
  # VPC that hosts every subnet of this stack; DNS support and DNS hostnames
  # are both switched on.
  VPC:
    Type: AWS::EC2::VPC
    Properties:
      CidrBlock: !Ref VpcCidrBlock
      EnableDnsHostnames: 'true'
      EnableDnsSupport: 'true'
  # Internet gateway, tagged with a Name derived from the stack name.
  InternetGateway:
    Type: AWS::EC2::InternetGateway
    Properties:
      Tags:
        - Value: !Sub '${AWS::StackName}-InternetGateway'
          Key: Name
  # Attaches the internet gateway to the VPC.
  AttachGateway:
    Type: AWS::EC2::VPCGatewayAttachment
    Properties:
      InternetGatewayId: !Ref InternetGateway
      VpcId: !Ref VPC

  # First public subnet (AZ ID apne1-az1); instances launched here receive a
  # public IP automatically. Both EC2 instances of this template live here.
  PublicSubnet1:
    Type: AWS::EC2::Subnet
    Properties:
      AvailabilityZoneId: 'apne1-az1'
      CidrBlock: !Ref PublicSubnetCidrBlock1
      MapPublicIpOnLaunch: true
      VpcId: !Ref VPC
  # Second public subnet (AZ ID apne1-az2); instances launched here receive a
  # public IP automatically.
  PublicSubnet2:
    Type: AWS::EC2::Subnet
    Properties:
      AvailabilityZoneId: 'apne1-az2'
      CidrBlock: !Ref PublicSubnetCidrBlock2
      MapPublicIpOnLaunch: true
      VpcId: !Ref VPC
  # First local subnet (AZ ID apne1-az1), a member of the RDS subnet group.
  # No route table is associated with it in this template.
  LocalSubnet1:
    Type: AWS::EC2::Subnet
    Properties:
      AvailabilityZoneId: 'apne1-az1'
      CidrBlock: !Ref LocalSubnetCidrBlock1
      VpcId: !Ref VPC
  # Second local subnet (AZ ID apne1-az2), a member of the RDS subnet group.
  # No route table is associated with it in this template.
  LocalSubnet2:
    Type: AWS::EC2::Subnet
    Properties:
      AvailabilityZoneId: 'apne1-az2'
      CidrBlock: !Ref LocalSubnetCidrBlock2
      VpcId: !Ref VPC
  # RDS subnet group made of the two local subnets, which sit in different
  # availability zones.
  DBSubnetGroup:
    Type: AWS::RDS::DBSubnetGroup
    Properties:
      SubnetIds:
        - !Ref LocalSubnet1
        - !Ref LocalSubnet2
      DBSubnetGroupDescription: 'Subnet group'

  #############################################################################
  # public Subnet
  #############################################################################   
  # Route table used by PublicSubnet1.
  RouteTable1:
    Type: AWS::EC2::RouteTable
    Properties:
      Tags:
        - Value: !Sub '${AWS::StackName}-RouteTable'
          Key: Name
      VpcId: !Ref VPC
  # Default route of RouteTable1: all non-local traffic goes to the internet
  # gateway.
  PublicRoute1:
    Type: 'AWS::EC2::Route'
    # A route that targets an internet gateway can only be created once the
    # gateway is attached to the VPC. Referencing InternetGateway alone does not
    # order this resource after the attachment, so the dependency is explicit.
    DependsOn: AttachGateway
    Properties:
      RouteTableId: !Ref RouteTable1
      DestinationCidrBlock: 0.0.0.0/0
      GatewayId: !Ref InternetGateway
  # Binds PublicSubnet1 to RouteTable1.
  SubnetRouteTableAssociation1:
    Type: AWS::EC2::SubnetRouteTableAssociation
    Properties:
      RouteTableId: !Ref RouteTable1
      SubnetId: !Ref PublicSubnet1

  # Route table used by PublicSubnet2. It carries the same Name tag value as
  # RouteTable1.
  RouteTable2:
    Type: AWS::EC2::RouteTable
    Properties:
      Tags:
        - Value: !Sub '${AWS::StackName}-RouteTable'
          Key: Name
      VpcId: !Ref VPC
  # Default route of RouteTable2: all non-local traffic goes to the internet
  # gateway.
  PublicRoute2:
    Type: 'AWS::EC2::Route'
    # A route that targets an internet gateway can only be created once the
    # gateway is attached to the VPC. Referencing InternetGateway alone does not
    # order this resource after the attachment, so the dependency is explicit.
    DependsOn: AttachGateway
    Properties:
      RouteTableId: !Ref RouteTable2
      DestinationCidrBlock: 0.0.0.0/0
      GatewayId: !Ref InternetGateway
  # Binds PublicSubnet2 to RouteTable2.
  SubnetRouteTableAssociation2:
    Type: AWS::EC2::SubnetRouteTableAssociation
    Properties:
      RouteTableId: !Ref RouteTable2
      SubnetId: !Ref PublicSubnet2

 
  #############################################################################
  # MLflow RESOURCES SECTION
  # This section contains all the MLflow related resources
  #############################################################################       
  # Role assumed by the MLflow EC2 instance (via MLflowInstanceProfile). It
  # carries the AWS-managed S3 and RDS full-access policies and has the fixed
  # name "TestManager".
  MLflowIAMRole:
    Type: AWS::IAM::Role
    Properties:
      RoleName: TestManager
      ManagedPolicyArns:
        - 'arn:aws:iam::aws:policy/AmazonS3FullAccess'
        - 'arn:aws:iam::aws:policy/AmazonRDSFullAccess'
      # Trust policy: only the EC2 service may assume this role.
      AssumeRolePolicyDocument:
        Version: "2012-10-17"
        Statement:
          - Action: "sts:AssumeRole"
            Effect: Allow
            Principal:
              Service: ec2.amazonaws.com
  # Instance profile wrapping MLflowIAMRole so it can be attached to EC2.
  MLflowInstanceProfile:
    Type: 'AWS::IAM::InstanceProfile'
    Properties:
      Roles:
        - !Ref MLflowIAMRole
      Path: '/'

  # EC2 instance that installs MLflow on first boot and runs
  # `mlflow server` on port 5000, with the RDS MySQL database as backend store
  # and the S3 bucket as artifact root.
  MLflowEC2Instance:
    Type: AWS::EC2::Instance
    # The bootstrap script downloads packages, so the instance must not start
    # before the subnet actually has a path to the internet. The bucket has no
    # property-level reference here, hence the explicit entry. The RDS instance
    # and the security group are already ordered first through the
    # !GetAtt / !Ref below.
    DependsOn:
      - AttachGateway
      - PublicRoute1
      - SubnetRouteTableAssociation1
      - MLflowS3Bucket01
    Properties:
      ImageId: ami-0dfa284c9d7b2adad  # Specify the appropriate AMI ID for your region and instance type
      InstanceType: !Ref InstanceType
      IamInstanceProfile: !Ref MLflowInstanceProfile
      KeyName: !Ref KeyName
      # NOTE(review): the database password ends up in UserData and in
      # /etc/start_mlflow.sh in clear text.
      UserData:
        Fn::Base64: !Sub
          - |
            #!/bin/bash
            FLAG_FILE=/var/log/first_run_completed

            # only first run
            if [ ! -f "$FLAG_FILE" ]; then
              sudo yum update -y
              sudo yum install -y python3 python3-pip
              sudo yum clean all
              sudo su ec2-user -c 'pip3 install -U pip'
              sudo su ec2-user -c 'pip3 install mlflow boto3 pymysql'
              sudo su ec2-user -c 'pip3 cache purge'
              # The DB endpoint is injected by CloudFormation from this stack's
              # own RDS instance instead of being looked up by name at boot.
              echo "python3 -m mlflow server \
                --host 0.0.0.0 \
                --port 5000 \
                --backend-store-uri mysql+pymysql://${USR}:${PW}@${DB_ENDPOINT}:3306/${DB_NAME} \
                --default-artifact-root s3://${S3_NAME}/mlflow-artifacts/" > /etc/start_mlflow.sh
              # The script embeds the DB password: restrict it to the user that runs it.
              chown ec2-user /etc/start_mlflow.sh
              chmod 700 /etc/start_mlflow.sh
              # Record that provisioning finished so a re-run skips it.
              touch "$FLAG_FILE"
            fi
            # start mlflow server
            nohup sudo su ec2-user -c "/etc/start_mlflow.sh" &
          - USR: !Ref MasterUsername
            PW: !Ref MasterUserPassword
            DB_NAME: !Ref DBName
            S3_NAME: !Ref S3Name
            DB_ENDPOINT: !GetAtt MLflowRDSInstance.Endpoint.Address
      NetworkInterfaces:
        - AssociatePublicIpAddress: "true"
          DeviceIndex: "0"
          GroupSet:
            - !Ref MLflowSecurityGroup
          SubnetId: !Ref PublicSubnet1
          PrivateIpAddress: !Ref MLFlowPrivateIpAddress
  # Elastic IP that is associated with the MLflow instance
  # (see MLflowInstanceAssociation).
  MLflowElasticIp:
    Type: 'AWS::EC2::EIP'
    # An Elastic IP used inside a VPC defined in the same template must wait
    # for the VPC-gateway attachment.
    DependsOn: AttachGateway
    Properties:
      # Allocate the address for use in a VPC.
      Domain: vpc
  # Associates the Elastic IP with the MLflow EC2 instance.
  MLflowInstanceAssociation:
    Type: AWS::EC2::EIPAssociation
    Properties:
      InstanceId: !Ref MLflowEC2Instance
      AllocationId: !GetAtt MLflowElasticIp.AllocationId

  # Security group of the MLflow instance. Both rules accept traffic from any
  # IPv4 address (0.0.0.0/0).
  MLflowSecurityGroup:
    Type: AWS::EC2::SecurityGroup
    Properties:
      VpcId: !Ref VPC
      GroupDescription: 'Security group for MLflow server'
      SecurityGroupIngress:
        # Port the MLflow server listens on
        - CidrIp: 0.0.0.0/0
          IpProtocol: tcp
          FromPort: 5000
          ToPort: 5000
        # SSH
        - CidrIp: 0.0.0.0/0
          IpProtocol: tcp
          FromPort: 22
          ToPort: 22

  # VPC security group for the database: MySQL (tcp/3306) is reachable only
  # from members of MLflowSecurityGroup.
  #
  # This replaces the former AWS::RDS::DBSecurityGroup resource
  # (DbSecurityByEC2SecurityGroup). DB security groups and the DBInstance
  # DBSecurityGroups property exist only for backward compatibility; a DB
  # instance in a VPC is meant to use VPCSecurityGroups. A new logical ID is
  # used because CloudFormation does not allow changing the resource type of an
  # existing logical ID during a stack update.
  MLflowRDSSecurityGroup:
    Type: AWS::EC2::SecurityGroup
    Properties:
      GroupDescription: Ingress for Amazon EC2 security group
      VpcId: !Ref VPC
      SecurityGroupIngress:
        - IpProtocol: tcp
          FromPort: 3306
          ToPort: 3306
          SourceSecurityGroupId: !GetAtt MLflowSecurityGroup.GroupId

  # MySQL database used as the MLflow backend store. It is placed in the
  # subnet group built from the two local subnets.
  MLflowRDSInstance:
    Type: 'AWS::RDS::DBInstance'
    Properties:
      VPCSecurityGroups:
        - !GetAtt MLflowRDSSecurityGroup.GroupId
      DBSubnetGroupName: !Ref DBSubnetGroup
      AllocatedStorage: 5
      DBInstanceClass: !Ref RDSInstanceClass
      Engine: mysql  # Choose the appropriate RDS engine
      MasterUsername: !Ref MasterUsername
      MasterUserPassword: !Ref MasterUserPassword
      DBName: !Ref DBName

  # Bucket that the MLflow server uses as its artifact root; the bucket name
  # comes straight from the S3Name parameter.
  MLflowS3Bucket01:
    Type: AWS::S3::Bucket
    Properties:
      BucketName: !Ref S3Name

  #############################################################################
  # Model developer
  #############################################################################
  # Role assumed by the model-developer EC2 instance (via
  # ModelDeveloperInstanceProfile). It carries the AWS-managed S3 full-access
  # policy and has the fixed name "ModelDeveloper".
  ModelDeveloperIAMRole:
    Type: AWS::IAM::Role
    Properties:
      RoleName: ModelDeveloper
      ManagedPolicyArns:
        - 'arn:aws:iam::aws:policy/AmazonS3FullAccess'
      # Trust policy: only the EC2 service may assume this role.
      AssumeRolePolicyDocument:
        Version: "2012-10-17"
        Statement:
          - Action: "sts:AssumeRole"
            Effect: Allow
            Principal:
              Service: ec2.amazonaws.com
  # Instance profile wrapping ModelDeveloperIAMRole so it can be attached to EC2.
  ModelDeveloperInstanceProfile:
    Type: 'AWS::IAM::InstanceProfile'
    Properties:
      Roles:
        - !Ref ModelDeveloperIAMRole
      Path: '/'
  # Workstation for model developers: installs python3/pip via apt and then
  # pip-installs boto3 for the image's default login user.
  ModelDeveloperEC2Instance:
    Type: AWS::EC2::Instance
    # The bootstrap script downloads packages right after boot, so the instance
    # must not start before the subnet has a working path to the internet
    # (gateway attached, default route present, route table associated).
    DependsOn:
      - AttachGateway
      - PublicRoute1
      - SubnetRouteTableAssociation1
    Properties:
      ImageId: ami-07c589821f2b353aa  # Specify the appropriate AMI ID for your region and instance type
      InstanceType: !Ref InstanceType
      IamInstanceProfile: !Ref ModelDeveloperInstanceProfile
      KeyName: !Ref KeyName
      # No CloudFormation substitution is needed in this script, so it is passed
      # as a plain literal block (no Fn::Sub) and shell variables need no escaping.
      UserData:
        Fn::Base64: |
          #!/bin/bash
          sudo apt update -y
          sudo apt install -y python3 python3-pip
          # The script uses apt, i.e. a Debian-family image, where the default
          # login user is presumably not "ec2-user" (e.g. "ubuntu" on Ubuntu).
          # Resolve the default user by UID 1000 and only fall back to the
          # original name if that lookup yields nothing.
          DEV_USER="$(getent passwd 1000 | cut -d: -f1)"
          DEV_USER="${DEV_USER:-ec2-user}"
          su "$DEV_USER" <<-EOF
          pip3 install -U pip
          pip3 install boto3
          EOF
      NetworkInterfaces:
        - AssociatePublicIpAddress: "true"
          DeviceIndex: "0"
          GroupSet:
            - !Ref ModelDeveloperSecurityGroup
          SubnetId: !Ref PublicSubnet1

  # Security group of the model-developer instance: SSH only, accepted from
  # any IPv4 address (0.0.0.0/0).
  ModelDeveloperSecurityGroup:
    Type: AWS::EC2::SecurityGroup
    Properties:
      VpcId: !Ref VPC
      GroupDescription: 'Security group for Model Trainig server'
      SecurityGroupIngress:
        - CidrIp: 0.0.0.0/0
          IpProtocol: tcp
          FromPort: 22
          ToPort: 22

Outputs:
  # Emit values needed for deployment status (e.g., where to SSH to)
  # An Elastic IP is associated with the MLflow instance after it launches,
  # which replaces the public address the instance was given at launch. The
  # Elastic IP is therefore reported directly rather than the instance's
  # PublicIp attribute, which may still hold the launch-time address.
  MLflowEC2IPAddress:
    Description: "MLflow EC2 Instance Public IP Address"
    Value: !GetAtt MLflowElasticIp.PublicIp