> ## Documentation Index
> Fetch the complete documentation index at: https://runpod-b18f5ded-lg-apiv2-new.mintlify.site/llms.txt
> Use this file to discover all available pages before exploring further.

# Create a serverless endpoint

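> Creates a serverless endpoint. Specify `gpu` for compute; CPU serverless
> endpoints are read-only and cannot be created through this API yet.
> `ContainerConfig` fields (`image`, `args`, `disk`, `ports`, `env`,
> `registry`) can be spread from a template response.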




## OpenAPI

````yaml post /v2/serverless
# Document header: OpenAPI spec version and API metadata.
# Version-like scalars are quoted so they are always read as strings.
openapi: '3.0.3'
info:
  title: RunPod REST API
  version: '2.0.0'
  description: RunPod public REST API — v2
# Relative server URL, i.e. relative to where this document is served.
servers:
  - url: '/'
    description: Current server
# Default security requirement for operations in this document; the scheme
# itself is declared under components.securitySchemes.
security:
  - bearerAuth: []
# Operation: POST /v2/serverless (operationId createEndpoint).
paths:
  /v2/serverless:
    post:
      tags:
        - Serverless
      summary: Create a serverless endpoint
      description: |
        Creates a serverless endpoint. Specify `gpu` for compute (CPU serverless
        endpoints are read-only). ContainerConfig fields can be spread from a
        template response.
      operationId: createEndpoint
      # A JSON body is mandatory; its shape is CreateEndpointRequest.
      requestBody:
        required: true
        content:
          application/json:
            schema:
              $ref: '#/components/schemas/CreateEndpointRequest'
      responses:
        # Success: the created resource is returned as an Endpoint.
        '201':
          description: Created
          content:
            application/json:
              schema:
                $ref: '#/components/schemas/Endpoint'
        # Catch-all for every status not listed above; the body is an
        # ErrorResponse served as application/problem+json.
        default:
          description: Error
          content:
            application/problem+json:
              schema:
                $ref: '#/components/schemas/ErrorResponse'
components:
  schemas:
    # Request body for POST /v2/serverless: the shared ContainerConfig fields
    # plus endpoint-specific settings. `image` is declared in ContainerConfig
    # and made mandatory here through `required`.
    CreateEndpointRequest:
      allOf:
        - $ref: '#/components/schemas/ContainerConfig'
        - type: object
          required:
            - name
            - image
            - gpu
          properties:
            name:
              type: string
              minLength: 1
              example: my-inference
            gpu:
              $ref: '#/components/schemas/EndpointGpuConfig'
            workers:
              type: object
              properties:
                min:
                  type: integer
                  minimum: 0
                  default: 0
                max:
                  type: integer
                  minimum: 0
                  default: 3
            scaling:
              type: object
              properties:
                type:
                  $ref: '#/components/schemas/ScalerType'
                # NOTE(review): presumably the threshold for the chosen
                # scaler type (seconds or request count) — confirm.
                value:
                  type: integer
                  minimum: 1
                  default: 4
                idleTimeout:
                  type: integer
                  minimum: 0
                  default: 5
                  # Unit stated explicitly to match Endpoint.scaling.idleTimeout.
                  description: Seconds before idle workers scale down
            dataCenterIds:
              type: array
              items:
                type: string
              description: >-
                Preferred data centers for placement. Omit or pass an empty
                array to let the scheduler choose.
            networkVolumes:
              type: array
              items:
                type: string
            timeout:
              type: integer
              default: 300000
              # Unit stated explicitly to match Endpoint.timeout; note this is
              # milliseconds while idleTimeout is seconds.
              description: Per-request execution timeout in milliseconds
            flashboot:
              # allOf wrapper lets `default` sit next to the $ref.
              allOf:
                - $ref: '#/components/schemas/FlashBoot'
              default: 'OFF'
    # Serverless endpoint resource returned by the create operation (201):
    # the shared ContainerConfig fields plus endpoint-specific state.
    # `gpu` and `cpu` are not listed in `required`.
    Endpoint:
      allOf:
        - $ref: '#/components/schemas/ContainerConfig'
        - type: object
          required:
            - id
            - name
            - workers
            - scaling
            - dataCenterIds
            - networkVolumes
            - timeout
            - flashboot
            - createdAt
          properties:
            id:
              type: string
              example: ep_abc123
            name:
              type: string
              example: my-inference
            # NOTE(review): in OpenAPI 3.0.x `nullable` is only defined to take
            # effect next to an explicit `type` in the same schema object;
            # confirm that consumers treat this allOf wrapper as nullable.
            gpu:
              nullable: true
              allOf:
                - $ref: '#/components/schemas/EndpointGpuConfig'
            # Same nullable-with-allOf pattern as `gpu` above.
            cpu:
              nullable: true
              description: >-
                Read-only. Present for CPU serverless endpoints; CPU
                create/update is not yet supported.
              allOf:
                - $ref: '#/components/schemas/CpuConfig'
            # Both bounds are always present in a response.
            workers:
              type: object
              required:
                - min
                - max
              properties:
                min:
                  type: integer
                  minimum: 0
                  example: 0
                max:
                  type: integer
                  minimum: 0
                  example: 5
            # All three scaling fields are always present in a response.
            scaling:
              type: object
              required:
                - type
                - value
                - idleTimeout
              properties:
                type:
                  $ref: '#/components/schemas/ScalerType'
                value:
                  type: integer
                  minimum: 1
                  example: 4
                idleTimeout:
                  type: integer
                  minimum: 0
                  description: Seconds before idle workers scale down
                  example: 5
            dataCenterIds:
              type: array
              items:
                type: string
              example:
                - US-TX-3
            networkVolumes:
              type: array
              items:
                type: string
              example:
                - vol_abc
            # Milliseconds, unlike scaling.idleTimeout which is in seconds.
            timeout:
              type: integer
              description: Per-request execution timeout in milliseconds
              example: 300000
            flashboot:
              $ref: '#/components/schemas/FlashBoot'
            createdAt:
              type: string
              format: date-time
              example: '2026-03-13T20:00:00Z'
    # Error payload used by the operation's `default` response, where it is
    # served as application/problem+json. No property is marked required.
    ErrorResponse:
      type: object
      properties:
        title:
          type: string
          description: Short human-readable summary
          example: Not Found
        status:
          type: integer
          description: HTTP status code
          example: 404
        detail:
          type: string
          description: Human-readable explanation
          example: pod not found
    # Shared container settings, composed into CreateEndpointRequest and
    # Endpoint through allOf. Nothing is required at this level; schemas that
    # include it add their own `required` lists.
    ContainerConfig:
      type: object
      description: >
        Reusable container configuration shared across templates, pods, and
        serverless endpoints. Adding a field here automatically propagates to
        all three resources.
      properties:
        image:
          type: string
          description: Docker image reference
          example: runpod/pytorch:2.8.0-py3.11-cuda12.8.1
        # A single string, not an array of arguments.
        args:
          type: string
          description: Arguments passed to the container entrypoint
          example: ''
        disk:
          type: integer
          minimum: 1
          description: Container disk in GB (ephemeral, wiped on restart)
          example: 50
        ports:
          type: array
          description: Exposed ports, formatted as port/protocol
          items:
            type: string
          example:
            - 8888/http
            - 22/tcp
        # Free-form map; every value must be a string.
        env:
          type: object
          additionalProperties:
            type: string
          description: Environment variables as key-value pairs
          example:
            JUPYTER_PASSWORD: hunter2
        # May be null as well as a string.
        registry:
          type: string
          nullable: true
          description: Container registry credential ID (for private images)
          example: null
    # GPU selection for an endpoint; referenced by `gpu` in both
    # CreateEndpointRequest and Endpoint. Only `pools` is required, and it
    # must list at least one pool.
    EndpointGpuConfig:
      type: object
      required:
        - pools
      properties:
        pools:
          type: array
          minItems: 1
          description: |
            Serverless GPU pool IDs (as returned by `GET /v2/catalog/gpus` in
            `pool`). Workers are placed on whichever listed pool has capacity.
          items:
            type: string
          example:
            - ADA_24
        # Optional; declared default is 1.
        count:
          type: integer
          minimum: 1
          default: 1
          description: GPUs per worker
          example: 1
    # String enum referenced by `scaling.type` in CreateEndpointRequest and
    # Endpoint.
    ScalerType:
      type: string
      description: |
        Autoscaling strategy.
        - `QUEUE_DELAY`   — scale on seconds a request waits in queue
        - `REQUEST_COUNT` — scale on in-flight request count
      enum:
        - QUEUE_DELAY
        - REQUEST_COUNT
    # String enum referenced by `flashboot` in CreateEndpointRequest (where it
    # defaults to 'OFF') and Endpoint.
    FlashBoot:
      type: string
      description: |
        FlashBoot cold-start acceleration mode.
        - `OFF`                — disabled
        - `FLASHBOOT`          — enabled
        - `PRIORITY_FLASHBOOT` — enabled with priority capacity
      enum:
        # Keep the quotes: a bare OFF is read as boolean false by YAML 1.1
        # parsers.
        - 'OFF'
        - FLASHBOOT
        - PRIORITY_FLASHBOOT
    # BaseCpuConfig plus a required memory size; referenced by Endpoint.cpu.
    CpuConfig:
      allOf:
        - $ref: '#/components/schemas/BaseCpuConfig'
        - type: object
          required:
            - memory
          properties:
            memory:
              type: integer
              minimum: 1
              description: Memory allocated to the pod in GB.
              example: 16
    # CPU flavor and vCPU count, both required; CpuConfig extends this schema
    # through allOf.
    BaseCpuConfig:
      type: object
      required:
        - id
        - vcpuCount
      properties:
        id:
          type: string
          description: CPU flavor identifier, as returned by GET /v2/catalog/cpus.
          example: cpu5c
          minLength: 1
        # The schema enforces only the lower bound of 2; the power-of-two and
        # per-flavor rules in the description are not expressed as constraints.
        vcpuCount:
          type: integer
          minimum: 2
          description: >-
            Number of vCPUs. Must be valid for the selected CPU flavor and must
            be a power of two.
          example: 4
  # Declares the bearerAuth scheme named by the top-level `security`
  # requirement: HTTP authentication with the `bearer` scheme.
  securitySchemes:
    bearerAuth:
      type: http
      scheme: bearer
      # Descriptive hint about the token format.
      bearerFormat: RunPod API Key

````