feat(outbox): implement concurrent claim mechanism with UPDATE RETURNING + SKIP LOCKED

- Add migration 0004 to introduce 'claiming' status and timeout index
- Add StatusClaiming to platformevent domain and allow it in Validate()
- Rewrite ListDue as transactional UPDATE ... RETURNING with FOR UPDATE SKIP LOCKED
- Add ReleaseStaleClaims to reset expired claiming events back to retrying
- Worker Start() now runs a 30s ticker for stale claim recovery (5m timeout)
- Update stubEventStore in tests to satisfy new EventStore interface

Refs: D-02
This commit is contained in:
Your Name
2026-05-11 13:16:28 +08:00
parent 771304eabe
commit 34b175b130
5 changed files with 86 additions and 9 deletions

View File

@@ -75,14 +75,30 @@ func (s *PlatformEventStore) ListDue(ctx context.Context, platform string, dueBe
if platform == "" {
return nil, fmt.Errorf("platform is required")
}
rows, err := s.db.QueryContext(ctx, `
SELECT id, platform, event_type, COALESCE(session_id::text, ''), COALESCE(ticket_id::text, ''), COALESCE(source_message_id, ''),
payload, status, attempt_count, next_attempt_at, occurred_at, created_at, updated_at,
delivered_at, COALESCE(last_error, '')
FROM cs_platform_event_outbox
WHERE platform = $1 AND status IN ('pending', 'retrying') AND next_attempt_at <= $2
ORDER BY next_attempt_at ASC, occurred_at ASC, created_at ASC, id ASC
LIMIT $3
tx, err := s.db.BeginTx(ctx, nil)
if err != nil {
return nil, err
}
defer func() {
if err != nil {
_ = tx.Rollback()
}
}()
rows, err := tx.QueryContext(ctx, `
UPDATE cs_platform_event_outbox
SET status = 'claiming', updated_at = NOW()
WHERE id IN (
SELECT id FROM cs_platform_event_outbox
WHERE platform = $1 AND status IN ('pending','retrying') AND next_attempt_at <= $2
ORDER BY next_attempt_at ASC, occurred_at ASC, created_at ASC, id ASC
LIMIT $3
FOR UPDATE SKIP LOCKED
)
RETURNING id, platform, event_type, COALESCE(session_id::text, ''), COALESCE(ticket_id::text, ''), COALESCE(source_message_id, ''),
payload, status, attempt_count, next_attempt_at, occurred_at, created_at, updated_at,
delivered_at, COALESCE(last_error, '')
`, platform, dueBefore, limit)
if err != nil {
return nil, err
@@ -126,9 +142,32 @@ func (s *PlatformEventStore) ListDue(ctx context.Context, platform string, dueBe
if err := rows.Err(); err != nil {
return nil, err
}
if err := tx.Commit(); err != nil {
return nil, err
}
return events, nil
}
// ReleaseStaleClaims moves outbox events that have stayed in the 'claiming'
// status for longer than timeout back to 'retrying', and reports how many
// rows were updated.
//
// Staleness is judged by comparing each row's updated_at with the database
// clock (NOW()), not the caller's clock. Released rows also get updated_at
// reset to NOW(). A zero or negative timeout puts the cutoff at NOW() or
// later, so every claim whose updated_at lies in the past is released.
//
// It returns an error when the store has no database handle, when the UPDATE
// fails, or when the driver cannot report the affected row count.
func (s *PlatformEventStore) ReleaseStaleClaims(ctx context.Context, timeout time.Duration) (int, error) {
	if s.db == nil {
		return 0, fmt.Errorf("db is nil")
	}
	// The timeout is bound as a plain number of seconds and converted to an
	// interval on the server by make_interval, so the placeholder is typed
	// double precision. Casting the placeholder itself ($1::interval) types
	// it as interval instead, which only works if the driver is willing to
	// send a Go float64 for an interval parameter.
	res, err := s.db.ExecContext(ctx, `
UPDATE cs_platform_event_outbox
SET status = 'retrying', updated_at = NOW()
WHERE status = 'claiming' AND updated_at < NOW() - make_interval(secs => $1)
`, timeout.Seconds())
	if err != nil {
		return 0, fmt.Errorf("release stale claims: %w", err)
	}
	n, err := res.RowsAffected()
	if err != nil {
		return 0, fmt.Errorf("release stale claims: rows affected: %w", err)
	}
	return int(n), nil
}
func (s *PlatformEventStore) MarkDelivered(ctx context.Context, eventID string, deliveredAt time.Time) error {
if s.db == nil {
return fmt.Errorf("db is nil")