API monitoring tracks the availability, performance, and correctness of APIs, while observability provides deeper insights into system behavior through metrics, logs, and traces. Together, they form the foundation for maintaining reliable API services.
Key differences:

- Monitoring answers predefined questions: is the API up, how fast is it responding, is it returning correct results?
- Observability lets you investigate questions you did not anticipate by correlating metrics, logs, and traces from inside the system.
- Monitoring is typically driven by external checks and dashboards; observability depends on rich instrumentation emitted by the service itself.

The examples below work through both sides, starting with a basic availability check.
# Example availability check with Python
import requests
from datetime import datetime

def check_api_availability(url):
    try:
        start = datetime.now()
        response = requests.get(url, timeout=5)
        end = datetime.now()
        return {
            'timestamp': start.isoformat(),
            'status_code': response.status_code,
            'response_time_ms': (end - start).total_seconds() * 1000,
            'available': response.status_code == 200
        }
    except Exception as e:
        return {
            'timestamp': datetime.now().isoformat(),
            'error': str(e),
            'available': False
        }
// Node.js example with OpenTelemetry
const { NodeTracerProvider } = require('@opentelemetry/sdk-trace-node');
const { SimpleSpanProcessor } = require('@opentelemetry/sdk-trace-base');
const { JaegerExporter } = require('@opentelemetry/exporter-jaeger');
const { HttpInstrumentation } = require('@opentelemetry/instrumentation-http');

// Send each finished span straight to the Jaeger collector
const provider = new NodeTracerProvider();
provider.addSpanProcessor(
  new SimpleSpanProcessor(
    new JaegerExporter({
      endpoint: 'http://jaeger-collector:14268/api/traces'
    })
  )
);
provider.register();

// Auto-instrument inbound and outbound HTTP calls
const httpInstrumentation = new HttpInstrumentation();
httpInstrumentation.setTracerProvider(provider);
// Go example with structured logging
package main

import (
    "log/slog"
    "net/http"
    "os"
    "time"
)

func main() {
    logger := slog.New(slog.NewJSONHandler(os.Stdout, nil))

    http.HandleFunc("/api", func(w http.ResponseWriter, r *http.Request) {
        start := time.Now()
        // API logic here
        logger.Info("API request",
            "method", r.Method,
            "path", r.URL.Path,
            "duration_ms", time.Since(start).Milliseconds(),
            "status", http.StatusOK,
            "user_agent", r.UserAgent(),
        )
    })

    http.ListenAndServe(":8080", nil)
}
# prometheus.rules.yml
groups:
  - name: api-alerts
    rules:
      - alert: HighErrorRate
        expr: sum(rate(http_requests_total{status=~"5.."}[5m])) by (service) / sum(rate(http_requests_total[5m])) by (service) > 0.05
        for: 10m
        labels:
          severity: critical
        annotations:
          summary: "High error rate on {{ $labels.service }}"
          description: "{{ $value | humanizePercentage }} of requests are failing"
# Synthetic test with Locust
from locust import HttpUser, task, between

class ApiUser(HttpUser):
    wait_time = between(1, 5)

    @task
    def get_resource(self):
        with self.client.get("/api/resource/123", catch_response=True) as response:
            if response.status_code != 200:
                response.failure(f"Unexpected status: {response.status_code}")
            elif response.json().get("data") is None:
                response.failure("Missing data field")
Error Budget = (100% - SLO) * Time Period

Example for a 99.9% monthly SLO:
Error Budget = 0.1% * 30 days = 43.2 minutes of allowed downtime
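As a quick sanity check, the same arithmetic fits in a few lines of Python (a minimal sketch; the helper name is illustrative, not from any library):

# Error-budget arithmetic (illustrative helper, not part of any library)
def error_budget_minutes(slo, period_days):
    """Allowed downtime in minutes for a given SLO over a period."""
    return (1 - slo) * period_days * 24 * 60

print(round(error_budget_minutes(0.999, 30), 1))  # 43.2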
-- BigQuery example for SLO tracking
SELECT
  DATE(timestamp) AS day,
  COUNT(*) AS total_requests,
  SUM(CASE WHEN status_code BETWEEN 200 AND 299 THEN 1 ELSE 0 END) AS success_count,
  SUM(CASE WHEN status_code BETWEEN 200 AND 299 THEN 0 ELSE 1 END) AS error_count,
  SUM(CASE WHEN status_code BETWEEN 200 AND 299 THEN 1 ELSE 0 END) / COUNT(*) AS availability
FROM api_requests
WHERE timestamp >= TIMESTAMP_SUB(CURRENT_TIMESTAMP(), INTERVAL 30 DAY)
GROUP BY day
ORDER BY day
// Spring Boot correlation ID example
// (jakarta.* imports assume Spring Boot 3+; older versions use javax.servlet.*)
import jakarta.servlet.Filter;
import jakarta.servlet.FilterChain;
import jakarta.servlet.ServletException;
import jakarta.servlet.http.HttpServletRequest;
import jakarta.servlet.http.HttpServletResponse;
import java.io.IOException;
import java.util.UUID;
import org.slf4j.MDC;
import org.springframework.context.annotation.Bean;
import org.springframework.context.annotation.Configuration;
import org.springframework.web.filter.OncePerRequestFilter;

@Configuration
public class CorrelationConfig {

    @Bean
    public Filter correlationIdFilter() {
        return new OncePerRequestFilter() {
            @Override
            protected void doFilterInternal(HttpServletRequest request,
                                            HttpServletResponse response,
                                            FilterChain filterChain)
                    throws ServletException, IOException {
                // Reuse the caller's correlation ID, or mint one for this request
                String correlationId = request.getHeader("X-Correlation-ID");
                if (correlationId == null) {
                    correlationId = UUID.randomUUID().toString();
                }
                // Make the ID available to every log statement on this thread
                MDC.put("correlationId", correlationId);
                response.setHeader("X-Correlation-ID", correlationId);
                try {
                    filterChain.doFilter(request, response);
                } finally {
                    MDC.remove("correlationId");
                }
            }
        };
    }
}
// Contract test example with Pact
const { Pact } = require('@pact-foundation/pact');
const { eachLike, like } = require('@pact-foundation/pact').Matchers;

describe("API Contract", () => {
  const provider = new Pact({
    consumer: "WebApp",
    provider: "UserService",
  });

  before(() => provider.setup());
  after(() => provider.finalize());

  describe("GET /user/{id}", () => {
    before(() => {
      return provider.addInteraction({
        state: 'user exists',
        uponReceiving: 'a request for user data',
        withRequest: {
          method: 'GET',
          path: '/user/123'
        },
        willRespondWith: {
          status: 200,
          headers: { 'Content-Type': 'application/json' },
          body: {
            id: like(123),
            name: like('John Doe'),
            email: like('john@example.com')
          }
        }
      });
    });

    it("should verify the contract", () => {
      // Test implementation
    });
  });
});
<!-- Browser RUM implementation -->
<script>
window.API_MONITORING = {
  track: function(apiCall) {
    const options = apiCall.options || {};
    const start = performance.now();
    return fetch(apiCall.url, options)
      .then(response => {
        const duration = performance.now() - start;
        // Fire-and-forget beacon so reporting never blocks the page
        navigator.sendBeacon('/monitoring', JSON.stringify({
          api: apiCall.url,
          method: options.method || 'GET',
          status: response.status,
          duration: duration,
          timestamp: new Date().toISOString()
        }));
        return response;
      });
  }
};

// Usage:
API_MONITORING.track({
  url: '/api/data',
  options: { method: 'POST' }
});
</script>
# Python anomaly detection with Prophet
from prophet import Prophet
import pandas as pd

def detect_anomalies(metrics_data):
    # metrics_data: iterable of (timestamp, value) pairs
    df = pd.DataFrame(metrics_data)
    df.columns = ['ds', 'y']
    df['ds'] = pd.to_datetime(df['ds'])  # ensure timestamps align with the forecast for the merge
    # Fit a forecast with a 99% uncertainty interval over the observed range
    model = Prophet(interval_width=0.99)
    model.fit(df)
    future = model.make_future_dataframe(periods=0)
    forecast = model.predict(future)
    # Flag observations that fall outside the predicted interval
    merged = df.merge(forecast, on='ds')
    anomalies = merged[(merged['y'] > merged['yhat_upper']) |
                       (merged['y'] < merged['yhat_lower'])]
    return anomalies[['ds', 'y', 'yhat_lower', 'yhat_upper']]
For production systems, consider integrating these checks into your CI/CD pipeline so regressions are caught before they reach users; a minimal smoke-test sketch follows below. Start with basic availability monitoring and gradually add more sophisticated observability features as your needs evolve.
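As one possible starting point, the availability check from the first example can be folded into a small pytest smoke test that runs as a pipeline step before a release is promoted (a sketch only; the URL, threshold, and test name are placeholders, not taken from any existing pipeline):

# test_smoke.py -- CI smoke test sketch; run with `pytest test_smoke.py`
import requests

API_URL = "https://staging.example.com/api/health"  # placeholder endpoint

def test_api_is_available():
    """Fail the pipeline if the API is unreachable, erroring, or slow."""
    response = requests.get(API_URL, timeout=5)
    assert response.status_code == 200, f"Unexpected status: {response.status_code}"
    assert response.elapsed.total_seconds() < 1.0, "Response took longer than 1 second"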