english/.opencode/skills/google-adk-python/references/deployment-cloud-run-vertex-gke.md

# Deployment: Cloud Run, Vertex AI, GKE

## Development Modes

```bash
# The ADK CLI takes agent *directories*, not `file.py:object` paths. Each agent
# is a package such as samples/agents/my_agent/ (with __init__.py and agent.py
# exposing `root_agent`).
adk web samples/agents --port 8080          # dev UI for every agent package in the folder
adk run samples/agents/my_agent             # interactive terminal session; type the prompt when asked
adk api_server samples/agents --port 8000   # local REST API server for the agents in the folder
```

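`adk api_server` endpoints: `/list-apps`, `/run`, `/run_sse`, plus session management under `/apps/{app_name}/users/{user_id}/sessions/{session_id}`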

## Cloud Run

```dockerfile
FROM python:3.11-slim
WORKDIR /app
# Consider pinning a specific uv tag instead of :latest for reproducible builds.
COPY --from=ghcr.io/astral-sh/uv:latest /uv /usr/local/bin/uv
COPY pyproject.toml uv.lock ./
COPY src/ ./src/
RUN uv sync --frozen --no-cache
EXPOSE 8080
# `adk api_server` takes a directory of agent packages (here src/, e.g.
# src/my_agent/agent.py), not a `file.py:object` path. Bind to 0.0.0.0 so the
# server is reachable from outside the container.
CMD ["uv", "run", "adk", "api_server", "--host", "0.0.0.0", "--port", "8080", "src"]
```

```bash
export PROJECT_ID=my-project REGION=us-central1
gcloud builds submit --tag "gcr.io/$PROJECT_ID/my-agent"

# Option A: plain environment variable (quick start; the value is readable in
# the service configuration). Use either option A or option B, not both.
gcloud run deploy my-agent \
  --image "gcr.io/$PROJECT_ID/my-agent" \
  --region "$REGION" \
  --set-env-vars "GOOGLE_API_KEY=$GOOGLE_API_KEY"

# Option B: Secret Manager (preferred for credentials)
# printf avoids a trailing newline and stores the real key, not a placeholder.
printf '%s' "$GOOGLE_API_KEY" | gcloud secrets create google-api-key --data-file=-
gcloud run deploy my-agent \
  --image "gcr.io/$PROJECT_ID/my-agent" \
  --region "$REGION" \
  --set-secrets GOOGLE_API_KEY=google-api-key:latest
```

## Vertex AI

```bash
# Deploy the agent package directory ./my_agent to Vertex AI Agent Engine.
# The target is a subcommand (`agent_engine`), and the agent is passed as a
# directory argument rather than a `file.py:object` path.
adk deploy agent_engine \
  --project my-project \
  --region us-central1 \
  --staging_bucket gs://my-staging-bucket \
  my_agent
```

```yaml
# NOTE(review): confirm which tool consumes this file and that it accepts
# these keys; the schema is not established anywhere in this document.
agent:
  name: my-agent
  model: gemini-2.5-flash
  region: us-central1
  scaling:
    min_instances: 1
    max_instances: 10
  resources:
    cpu: 2
    memory: 4Gi
```

```python
from google.cloud import aiplatform

# Project, region and resource IDs below are placeholders.
aiplatform.init(project='my-project', location='us-central1')
endpoint = aiplatform.Endpoint(
  endpoint_name='projects/123/locations/us-central1/endpoints/456',
)
# NOTE(review): this calls predict() on an `endpoints/...` resource; confirm
# that this is the intended way to query the deployed agent.
response = endpoint.predict(
  instances=[{'prompt': 'What is 2+2?'}],
)
```

## GKE

```yaml
apiVersion: apps/v1
kind: Deployment
metadata: {name: my-agent}
spec:
  replicas: 3
  # apps/v1 requires a selector, and it must match the pod template labels.
  selector:
    matchLabels: {app: my-agent}
  template:
    metadata:
      labels: {app: my-agent}
    spec:
      containers:
      - name: agent
        image: gcr.io/my-project/my-agent:latest
        ports: [{containerPort: 8080}]
        env:
        - name: GOOGLE_API_KEY
          # Read from the `google-api-key` Secret, entry `key`.
          valueFrom: {secretKeyRef: {name: google-api-key, key: key}}
        resources:
          requests: {memory: "2Gi", cpu: "1"}
          limits: {memory: "4Gi", cpu: "2"}
---
apiVersion: v1
kind: Service
metadata: {name: my-agent}
spec:
  type: LoadBalancer
  # Without a selector the Service has no endpoints and routes to nothing.
  selector: {app: my-agent}
  ports: [{port: 80, targetPort: 8080}]
---
apiVersion: autoscaling/v2
kind: HorizontalPodAutoscaler
metadata: {name: my-agent-hpa}
spec:
  # apiVersion identifies the API group of the scaled resource.
  scaleTargetRef: {apiVersion: apps/v1, kind: Deployment, name: my-agent}
  minReplicas: 2
  maxReplicas: 10
  metrics:
  - type: Resource
    # Scale on average CPU utilization relative to the container's CPU request.
    resource: {name: cpu, target: {type: Utilization, averageUtilization: 70}}
```

```bash
# With --region the cluster is regional and --num-nodes is applied per zone,
# so the total node count is num-nodes multiplied by the number of zones.
gcloud container clusters create my-cluster --region us-central1 --num-nodes 3
gcloud container clusters get-credentials my-cluster --region us-central1
# Quote the value so shell word splitting or globbing cannot corrupt the key.
kubectl create secret generic google-api-key --from-literal=key="$GOOGLE_API_KEY"
kubectl apply -f deployment.yaml
```

## Best Practices

```python
# config.py
import os
from dataclasses import dataclass, field
from typing import Optional

@dataclass
class Config:
  """Runtime settings read from environment variables.

  Every field uses ``default_factory`` so the environment is read each time a
  ``Config`` is created, not once when the class is defined. Values exported
  after this module is imported are therefore still picked up. Explicit
  constructor arguments override the environment.
  """
  # Model identifier; 'gemini-2.5-flash' when MODEL_ID is unset.
  model_id: str = field(default_factory=lambda: os.getenv('MODEL_ID', 'gemini-2.5-flash'))
  # None when GOOGLE_API_KEY is unset, hence Optional.
  api_key: Optional[str] = field(default_factory=lambda: os.getenv('GOOGLE_API_KEY'))
  # Log level name; 'INFO' when LOG_LEVEL is unset.
  log_level: str = field(default_factory=lambda: os.getenv('LOG_LEVEL', 'INFO'))

# Health checks
@app.get('/health')
async def health():
  """Report a fixed healthy status for probes hitting GET /health."""
  return {'status': 'healthy'}

# Logging
# Import by full module path so the name `logging` is not bound to the Cloud
# client library, which would shadow the standard-library `logging` module.
import google.cloud.logging

client = google.cloud.logging.Client()
# Routes records from Python's standard logging to Cloud Logging.
client.setup_logging()

# Rate limiting
from slowapi import Limiter, _rate_limit_exceeded_handler
from slowapi.errors import RateLimitExceeded
from slowapi.util import get_remote_address

# Requests are bucketed by the client's remote address.
limiter = Limiter(key_func=get_remote_address)
# slowapi expects the limiter on app.state, plus a registered handler so an
# exceeded limit is returned as HTTP 429 instead of an unhandled exception.
app.state.limiter = limiter
app.add_exception_handler(RateLimitExceeded, _rate_limit_exceeded_handler)

@app.post('/chat')  # the route decorator must stay above the limit decorator
@limiter.limit('10/minute')
async def chat(request: Request, prompt: str):
  """Run the agent on ``prompt``, limited to 10 calls per minute per client.

  Args:
    request: Incoming request; slowapi requires it as an explicit parameter.
    prompt: Text forwarded to the agent.

  Returns:
    A dict holding the agent's reply text under the 'response' key.
  """
  # NOTE(review): `app`, `Request` and `agent` are not defined in this snippet.
  # Confirm that `agent.run` is awaitable and that its result exposes `.text`.
  return {'response': (await agent.run(prompt)).text}
```