Published on

How to set up AWS RDS IAM database authentication with Ecto

Using AWS (2024-03-24)

Recently I had to switch from ExAWS to the AWS library (the `:aws` package). I’m going to quickly run through how I set it up with AWS, but you can read the original ExAWS post below.

TLDR: We want to use RDS IAM to connect to our DB, so we have to generate a short-lived token based on the current AWS role (e.g. ECS instance) and use it as the database password.

First, add the dependencies:

# Dependencies needed for RDS IAM authentication with the `:aws` library.
#
# `deps/0` has to return a list, so the tuples are wrapped in `[...]`; merge
# them into your project's existing dependency list.
defp deps do
  [
    # ...

    # To generate the presigned url and for working with AWS
    {:aws, "~> 0.14.0"},

    # Alternatively you can just include `:aws_signature` if you are not
    # going to use `AWS` for anything.
    # {:aws_signature, "~> 0.3"}

    # To fetch AWS credentials
    #
    # We don't want `:aws_credentials` to start in dev and test since
    # the startup will fail if no credentials can be fetched.
    {:aws_credentials, "~> 0.2.1", runtime: Mix.env() == :prod},

    # CAStore for connecting to RDS with TLS
    {:aws_rds_castore, "~> 1.1"}
  ]
end

We only start :aws_credentials in production as :aws_credentials halts the application startup if it can’t fetch any AWS credentials.

Now set up config/runtime.exs to fetch the required arguments:

# The `:configure` callback is invoked as `apply(mod, fun, [opts | args])`, so
# the credentials keyword list must be wrapped in a list to arrive as the
# single second argument of `MyApp.Repo.configure_with_auth_token/2`.
# Without the wrapping each `{key, value}` pair becomes its own argument.
config :my_app, MyApp.Repo,
  configure: {
    MyApp.Repo,
    :configure_with_auth_token,
    [
      [
        host: host,
        username: username,
        dbname: dbname,
        port: port,
        region: region
      ]
    ]
  }

I store these arguments in a single environment variable as JSON and decode it before setting them here.

Also make sure :aws_credentials is started when running migrations in config/runtime.exs:

# Append `:aws_credentials` to the apps that are started before migrations
# run. Raises if the repo config or its `:start_apps_before_migration` key
# has not been set.
repo_config = Application.fetch_env!(:my_app, MyApp.Repo)

start_apps_before_migration =
  Keyword.fetch!(repo_config, :start_apps_before_migration) ++ [:aws_credentials]

config :my_app, MyApp.Repo, start_apps_before_migration: start_apps_before_migration

We put this in config/runtime.exs as :aws_credentials will fail startup when it can’t fetch the AWS credentials. We don’t want it to run in the development and test environments.

Finally set up configure_with_auth_token/2 in your repo module:

defmodule MyApp.Repo do
  use Ecto.Repo,
    otp_app: :my_app,
    adapter: Ecto.Adapters.Postgres

  # `:configure` callback: returns `opts` with the connection settings
  # overridden by the given credentials and a freshly signed auth token as
  # the password. Raises `KeyError` when `credentials` lacks one of `:host`,
  # `:username`, `:port`, `:dbname` or `:region`.
  def configure_with_auth_token(opts, credentials) do
    [hostname, username, port, dbname, region] =
      Enum.map(
        [:host, :username, :port, :dbname, :region],
        &Keyword.fetch!(credentials, &1)
      )

    password =
      :aws_credentials.get_credentials()
      |> rds_auth_token(hostname, port, username, region)

    overrides = [
      hostname: hostname,
      port: port,
      username: username,
      password: password,
      database: dbname,
      ssl: true,
      ssl_opts: AwsRdsCAStore.ssl_opts(hostname)
    ]

    Keyword.merge(opts, overrides)
  end

  # Signs a `connect` request for the "rds-db" service with SigV4 query
  # parameters and returns the signed URL without its "https://" scheme.
  # `opts` is passed straight to `:aws_signature` (default `ttl: 900`).
  defp rds_auth_token(aws_credentials, hostname, port, username, region, opts \\ [ttl: 900]) do
    key_id = Map.fetch!(aws_credentials, :access_key_id)
    secret = Map.fetch!(aws_credentials, :secret_access_key)
    signed_at = :erlang.universaltime()
    connect_url = "https://#{hostname}:#{port}/?Action=connect&DBUser=#{username}"

    key_id
    |> :aws_signature.sign_v4_query_params(secret, region, "rds-db", signed_at, connect_url, opts)
    |> String.trim_leading("https://")
  end
end

I still had to keep the wait_for_connection logic in place from further down to prevent flakey deployments.

Assuming role

To connect to a database cross-account you must assume a role before generating the token. This wasn’t too bad using AWS, but did take a minute to get right. Instead of calling :aws_credentials.get_credentials/0 in MyApp.Repo.configure_with_auth_token/2, we’ll assume the role first:

defmodule MyApp.Repo do
  # ...

  # Returns the AWS credentials to sign the auth token with. When
  # `db_credentials` contains a `:role_arn`, that role is assumed first and
  # the temporary credentials of the assumed role are returned instead.
  defp get_aws_credentials(db_credentials, region) do
    aws_credentials = :aws_credentials.get_credentials()

    case Keyword.get(db_credentials, :role_arn) do
      nil -> aws_credentials
      role_arn -> assume_role(aws_credentials, region, role_arn)
    end
  end

  # Calls STS AssumeRole with the current credentials and returns a map with
  # the same keys as the input credentials (`:access_key_id`,
  # `:secret_access_key`, `:token`).
  #
  # `opts` is a keyword list; `:ttl` is sent as "DurationSeconds".
  # Raises if the STS call fails.
  defp assume_role(aws_credentials, region, role_arn, opts \\ [ttl: 900]) do
    access_key = Map.fetch!(aws_credentials, :access_key_id)
    secret_key = Map.fetch!(aws_credentials, :secret_access_key)
    session_token = Map.fetch!(aws_credentials, :token)
    client = AWS.Client.create(access_key, secret_key, session_token, region)

    input = %{
      # `opts` is a keyword list, so it must be read with `Keyword`;
      # `Map.fetch!/2` raises `BadMapError` on it.
      "DurationSeconds" => Keyword.fetch!(opts, :ttl),
      "RoleArn" => role_arn,
      "RoleSessionName" => to_string(__MODULE__)
    }

    case AWS.STS.assume_role(client, input) do
      {:ok,
       %{
         "AssumeRoleResponse" => %{
           "AssumeRoleResult" => %{
             "Credentials" => %{
               "AccessKeyId" => access_key_id,
               "SecretAccessKey" => secret_access_key,
               "SessionToken" => assumed_session_token
             }
           }
         }
       }, _response} ->
        %{
          access_key_id: access_key_id,
          secret_access_key: secret_access_key,
          token: assumed_session_token
        }

      {:error, %{__exception__: true} = exception} ->
        raise exception

      {:error, reason} ->
        # `raise/1` only accepts a module, string or exception, so wrap any
        # other error term in a message that says what failed.
        raise "Could not assume role #{role_arn}: #{inspect(reason)}"
    end
  end
end

Now in rds_auth_token/6 you must pull out the session token as well:

defp rds_auth_token(aws_credentials, hostname, port, username, region, opts \\ [ttl: 900]) do
  # ...
  # Quote the session token and hand it to the signer through the
  # `:session_token` option.
  session_token =
    aws_credentials
    |> Map.fetch!(:token)
    |> :uri_string.quote()

  opts = Keyword.put(opts, :session_token, session_token)
  # ...
end

Remember to add :role_arn to your MyApp.Repo config.

Logging startup errors

Any exception raised in the configure_with_auth_token/2 callback didn’t show up in the logs. To understand what went wrong when the app starts to fail, I had to catch any exceptions and print them before reraising to halt the application startup:

defmodule MyApp.Repo do
  def configure_with_auth_token(opts, credentials) do
    # ...
  rescue
    exception ->
      # A failure here takes down the supervisor and with it the whole app,
      # so print the error banner first and then reraise with the original
      # stacktrace.
      banner = Exception.format_banner(:error, exception, __STACKTRACE__)
      IO.warn(banner)

      reraise exception, __STACKTRACE__
  end
end

ExAWS (2023-11-28)

AWS RDS supports IAM database authentication. This means that we don’t have to deal with password rotation and can instead use short-lived tokens as database passwords!

To set up RDS IAM database authentication you need to enable RDS IAM authentication first and ensure that your database user has RDS IAM authentication enabled.

In postgres this requires running a GRANT rds_iam TO REPLACE_WITH_DB_USERNAME; query.

The token must be generated and used as the database password each time Ecto sets up a connection as the token will only be valid for 15 minutes.

We’ll use ExAWS in the example below.

First we’ll add the dependencies:

# Dependencies for the ExAWS-based setup.
#
# `deps/0` has to return a list, so the tuples are wrapped in `[...]`; merge
# them into your project's existing dependency list.
defp deps do
  [
    # ...

    # To generate the token
    {:ex_aws, "~> 2.4"},

    # CAStore for connecting to RDS with TLS
    {:aws_rds_castore, "~> 1.1"}
  ]
end

Now we’ll update the repo config. Since the URL is generated on demand we don’t need to set the :url option. Instead, we will use the :configure callback. This is what I have in my config/runtime.exs:

# The `:configure` callback is invoked as `apply(mod, fun, [opts | args])`, so
# the credentials keyword list must be wrapped in a list to arrive as the
# single second argument of `MyApp.Repo.configure_with_auth_token/2`.
# Without the wrapping each `{key, value}` pair becomes its own argument.
config :my_app, MyApp.Repo,
  configure: {
    MyApp.Repo,
    :configure_with_auth_token,
    [
      [
        host: host,
        username: username,
        dbname: dbname,
        port: port
      ]
    ]
  }

And you should also make sure that ExAws is started when running migrations in config/config.exs:

# ExAWS is used in prod to generate the IAM DB password token, so it is
# listed with the apps that get started before migrations run.
apps_before_migration = [:ssl, :logger, :ex_aws]

config :my_app, MyApp.Repo, start_apps_before_migration: apps_before_migration

The last piece is to implement the configure_with_auth_token/2 function:

# NOTE(review): this listing sits in the ExAWS section, but it calls
# `:aws_credentials` and `:aws_signature` and requires a `:region` key. None
# of these appear in this section's deps or repo config. Confirm that this is
# the intended ExAWS-era code and not a copy of the newer listing.
defmodule MyApp.Repo do
  use Ecto.Repo,
    otp_app: :my_app,
    adapter: Ecto.Adapters.Postgres

  # Helper function to configure the connection with dynamically generated
  # auth token for the IAM instance role
  #
  # Returns `opts` merged with the hostname, port, username, database and TLS
  # settings, using the generated auth token as the password. Raises
  # `KeyError` if `credentials` lacks any of the keys fetched below.
  def configure_with_auth_token(opts, credentials) do
    hostname = Keyword.fetch!(credentials, :host)
    username = Keyword.fetch!(credentials, :username)
    port = Keyword.fetch!(credentials, :port)
    dbname = Keyword.fetch!(credentials, :dbname)
    # NOTE(review): `:region` is not part of the config shown for this
    # section, so this fetch would raise with that config; verify.
    region = Keyword.fetch!(credentials, :region)

    aws_credentials = :aws_credentials.get_credentials()
    auth_token = rds_auth_token(aws_credentials, hostname, port, username, region)

    Keyword.merge(opts, [
      hostname: hostname,
      port: port,
      username: username,
      password: auth_token,
      database: dbname,
      ssl: true,
      ssl_opts: AwsRdsCAStore.ssl_opts(hostname)
    ])
  end

  # Signs a `connect` URL for the "rds-db" service with SigV4 query parameters
  # and returns it with the leading "https://" removed. `opts` is passed on to
  # `:aws_signature` and defaults to `ttl: 900`.
  defp rds_auth_token(aws_credentials, hostname, port, username, region, opts \\ [ttl: 900]) do
    access_key = Map.fetch!(aws_credentials, :access_key_id)
    secret_key = Map.fetch!(aws_credentials, :secret_access_key)
    datetime = :erlang.universaltime()
    url = "https://#{hostname}:#{port}/?Action=connect&DBUser=#{username}"

    signed_url =
      :aws_signature.sign_v4_query_params(
        access_key,
        secret_key,
        region,
        "rds-db",
        datetime,
        url,
        opts
      )

    # The token used as the password is the signed URL without its scheme.
    String.trim_leading(signed_url, "https://")
  end
end

Now you have RDS IAM database authentication running!

Flakey deployments during migrations with ExAws

I saw flakey deployments when I first implemented this in my ECS cluster. Migrations were running in an ECS task and frequently failed after a few seconds. There were no helpful error messages in the log, all it explained was:

[error] Could not create schema migrations table. This error usually happens due to the following:
...
** (DBConnection.ConnectionError) connection not available and request was dropped from queue after 2967ms. This means requests are coming in and your connection pool cannot serve them fast enough. You can address this by:
...

I found that this was a combination of both ExAWS and the Ecto.Migrator.run/3 call. ExAWS takes a bit before it has the instance credentials available in the ExAws.Config.AuthCache GenServer, and Ecto.Migrator.run/3 won’t wait for a connection to be established with the :configure callback before running the query.

What we have to do is to force the migration task to wait until we have an established connection. We could increase :queue_target and :queue_interval, but I felt it was better to just wait until the connection had been established.

First, we’ll update our release module to call wait_for_connection/1:

defmodule MyApp.Release do
  # ...

  # Runs all pending migrations for every repo, waiting for a database
  # connection before each run.
  def migrate do
    load_app()

    for repo <- repos() do
      run_when_connected(repo, :up, all: true)
    end
  end

  # Rolls `repo` back to `version`, waiting for a database connection first.
  def rollback(repo, version) do
    load_app()

    run_when_connected(repo, :down, to: version)
  end

  # Starts the repo, blocks until a connection is available and then runs the
  # migrator in the given direction. Crashes unless `with_repo/2` returns an
  # `{:ok, _, _}` tuple.
  defp run_when_connected(repo, direction, migrator_opts) do
    {:ok, _, _} =
      Ecto.Migrator.with_repo(repo, fn started_repo ->
        wait_for_connection(started_repo)
        Ecto.Migrator.run(started_repo, direction, migrator_opts)
      end)
  end

  # ...
end

Now we implement wait_for_connection/1 that will run a query every interval (every 50ms) until it gets a connection or times out (after 30s):

defmodule MyApp.Release do
  # ...

  @interval 50
  @timeout :timer.seconds(30)

  # Due to ExAWS being slow we want to give the migration task enough time
  # to establish a connection before running the migrations.
  defp wait_for_connection(repo) do
    wait_for_connection(repo, System.monotonic_time(), 0)
  end

  # Polls the repo until the canary query succeeds. `elapsed` is the time in
  # milliseconds measured after the previous failed attempt; once it reaches
  # @timeout the function raises instead of trying again.
  defp wait_for_connection(repo, started_at, elapsed) do
    cond do
      elapsed >= @timeout ->
        raise "Could not establish a connection with #{inspect repo} after #{elapsed}ms"

      canary(repo) == :ok ->
        :ok

      true ->
        # Measure before sleeping, then wait one interval and retry.
        native_elapsed = System.monotonic_time() - started_at
        elapsed_ms = System.convert_time_unit(native_elapsed, :native, :millisecond)

        :timer.sleep(@interval)
        wait_for_connection(repo, started_at, elapsed_ms)
    end
  end

  # Returns :ok when `SELECT 1` comes back with the expected row, and :error
  # for any other result or a `DBConnection.ConnectionError`.
  defp canary(repo) do
    if match?({:ok, %{rows: [[1]]}}, repo.query("SELECT 1")) do
      :ok
    else
      :error
    end
  rescue
    DBConnection.ConnectionError -> :error
  end

  # ...
end

This resolved the flakey deployments.

Hi, I'm Dan Schultzer, I write in this blog, work a lot in Elixir, maintain several open source projects, and help companies streamline their development process