| 123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281 |
- import Accelerate
- import AVFoundation
- import Foundation
/// Detects the musical key of an audio file using chromagram analysis
/// with Krumhansl-Kessler key profiles.
///
/// Pipeline: decode up to 30 seconds of mono audio, compute a power spectrum
/// for each overlapping frame, fold the FFT bins into 12 pitch classes,
/// average the frames, and pick the major or minor key whose profile has the
/// highest Pearson correlation with the averaged chroma vector.
struct KeyDetector {

    // MARK: - Key Profiles (Krumhansl-Kessler)

    /// Major key profile weights, indexed by semitones above the tonic
    /// (for C major: C, C#, D, ..., B).
    private static let majorProfile: [Double] = [
        6.35, 2.23, 3.48, 2.33, 4.38, 4.09,
        2.52, 5.19, 2.39, 3.66, 2.29, 2.88
    ]

    /// Minor key profile weights, indexed by semitones above the tonic.
    private static let minorProfile: [Double] = [
        6.33, 2.68, 3.52, 5.38, 2.60, 3.53,
        2.54, 4.75, 3.98, 2.69, 3.34, 3.17
    ]

    /// Display names for the 12 pitch classes, starting at C.
    private static let noteNames = ["C", "C#", "D", "Eb", "E", "F", "F#", "G", "Ab", "A", "Bb", "B"]

    /// Camelot wheel codes for DJ-friendly key display, indexed by root pitch class.
    private static let camelotMajor = ["8B", "3B", "10B", "5B", "12B", "7B", "2B", "9B", "4B", "11B", "6B", "1B"]
    private static let camelotMinor = ["5A", "12A", "7A", "2A", "9A", "4A", "11A", "6A", "1A", "8A", "3A", "10A"]

    // MARK: - Configuration

    /// Number of samples per analysis frame (must be a power of two for the radix-2 FFT).
    private static let fftSize = 4096
    /// Number of samples between the starts of consecutive frames (50% overlap).
    private static let hopSize = 2048
    /// Tuning reference used to convert frequencies to semitones.
    private static let referenceFrequency: Double = 440.0 // A4

    // MARK: - Result

    /// Outcome of a key detection run.
    struct KeyResult {
        let key: String          // e.g., "C Major" or "A Minor"
        let camelotCode: String  // e.g., "8B" or "8A"
        let confidence: Double   // 0.0 to 1.0
        let rootNote: Int        // pitch class index 0-11
        let isMinor: Bool

        /// Compact key label: the root note name, followed by "m" for minor keys.
        var shortKey: String {
            let note = KeyDetector.noteNames[rootNote]
            return "\(note)\(isMinor ? "m" : "")"
        }
    }

    // MARK: - Public API

    /// Detects the key of the audio file referenced by `track.fileURL`.
    ///
    /// - Throws: Whatever `detectKey(fileURL:)` throws.
    static func detectKey(for track: Track) async throws -> KeyResult {
        try await detectKey(fileURL: track.fileURL)
    }

    /// Detects the key of the audio file at `fileURL`, analysing at most the
    /// first 30 seconds on a detached `.userInitiated` task.
    ///
    /// - Throws: `KeyDetectionError.insufficientAudio` when the decoded audio
    ///   is not longer than two FFT frames, plus any error raised while
    ///   opening or decoding the file.
    static func detectKey(fileURL: URL) async throws -> KeyResult {
        try await Task.detached(priority: .userInitiated) {
            let sampleRate: Double
            let samples: [Float]

            if OGGDecoder.isOGGFile(fileURL) {
                let decoded = try OGGDecoder.readMonoSamples(url: fileURL, maxSeconds: 30)
                sampleRate = decoded.sampleRate
                samples = decoded.samples
            } else {
                let audioFile = try AVAudioFile(forReading: fileURL)
                sampleRate = audioFile.processingFormat.sampleRate
                samples = try readMonoSamples(from: audioFile, maxSeconds: 30)
            }

            guard samples.count > fftSize * 2 else {
                throw KeyDetectionError.insufficientAudio
            }

            // Build the chromagram, average it across time, then match it
            // against the key profiles.
            let chromagram = computeChromagram(samples: samples, sampleRate: sampleRate)
            let avgChroma = averageChromagram(chromagram)
            return matchKeyProfile(chroma: avgChroma)
        }.value
    }

    // MARK: - Audio Reading

    /// Reads up to `maxSeconds` of audio from the start of `audioFile` and
    /// returns it as mono samples.
    ///
    /// The buffer is allocated in the file's own processing format because
    /// `AVAudioFile.read(into:frameCount:)` requires the buffer format to match
    /// it; multichannel audio is then downmixed by averaging the channels.
    ///
    /// - Returns: The mono samples, or an empty array when there is nothing to read.
    /// - Throws: `KeyDetectionError.formatError` if the buffer cannot be created,
    ///   `KeyDetectionError.noAudioData` if the buffer exposes no float samples,
    ///   or any error thrown by the file read.
    private static func readMonoSamples(from audioFile: AVAudioFile, maxSeconds: Double) throws -> [Float] {
        let fileFormat = audioFile.processingFormat
        let frameLimit = min(Double(audioFile.length), fileFormat.sampleRate * maxSeconds)

        // An empty file or a non-positive duration leaves nothing to decode;
        // bail out before the unsigned conversion below can trap.
        guard frameLimit >= 1 else { return [] }
        let maxFrames = AVAudioFrameCount(frameLimit)

        guard let buffer = AVAudioPCMBuffer(pcmFormat: fileFormat, frameCapacity: maxFrames) else {
            throw KeyDetectionError.formatError
        }

        audioFile.framePosition = 0
        try audioFile.read(into: buffer, frameCount: maxFrames)

        guard let channels = buffer.floatChannelData else {
            throw KeyDetectionError.noAudioData
        }

        let frameCount = Int(buffer.frameLength)
        let channelCount = Int(fileFormat.channelCount)
        let sampleStride = buffer.stride
        guard frameCount > 0, channelCount > 0 else { return [] }

        // Fast path: contiguous mono data can be copied out directly.
        if channelCount == 1 && sampleStride == 1 {
            return Array(UnsafeBufferPointer(start: channels[0], count: frameCount))
        }

        // Downmix: sum every channel into `mono`, then divide by the channel count.
        var mono = [Float](repeating: 0, count: frameCount)
        mono.withUnsafeMutableBufferPointer { monoPtr in
            guard let out = monoPtr.baseAddress else { return }
            let length = vDSP_Length(frameCount)
            for channel in 0..<channelCount {
                vDSP_vadd(out, 1, channels[channel], sampleStride, out, 1, length)
            }
            var scale = 1 / Float(channelCount)
            vDSP_vsmul(out, 1, &scale, out, 1, length)
        }
        return mono
    }

    // MARK: - Chromagram Computation

    /// Computes one 12-bin chroma vector per analysis frame.
    ///
    /// Frames are `fftSize` samples long, start every `hopSize` samples, and
    /// are Hann-windowed before the FFT. Each chroma bin accumulates the
    /// squared magnitudes of the FFT bins mapped to that pitch class.
    ///
    /// - Returns: The chroma vectors in time order, or an empty array when
    ///   `samples` is shorter than one frame or the FFT setup cannot be created.
    private static func computeChromagram(samples: [Float], sampleRate: Double) -> [[Double]] {
        // Without at least one full frame the frame count below would be negative.
        guard samples.count >= fftSize else { return [] }

        let halfFFT = fftSize / 2
        let frameLength = vDSP_Length(fftSize)
        let log2n = vDSP_Length(log2(Double(fftSize)))

        guard let fftSetup = vDSP_create_fftsetup(log2n, FFTRadix(kFFTRadix2)) else { return [] }
        defer { vDSP_destroy_fftsetup(fftSetup) }

        // The last frame starts at (numFrames - 1) * hopSize, which by this
        // formula never exceeds samples.count - fftSize, so every frame fits.
        let numFrames = (samples.count - fftSize) / hopSize + 1
        var chromagram = [[Double]]()
        chromagram.reserveCapacity(numFrames)

        var window = [Float](repeating: 0, count: fftSize)
        vDSP_hann_window(&window, frameLength, Int32(vDSP_HANN_NORM))

        // Pre-compute the FFT-bin-to-pitch-class mapping once for all frames.
        let chromaMap = buildChromaMap(fftSize: fftSize, sampleRate: sampleRate)

        // Scratch buffers reused by every frame.
        var frame = [Float](repeating: 0, count: fftSize)
        var real = [Float](repeating: 0, count: halfFFT)
        var imag = [Float](repeating: 0, count: halfFFT)
        var magnitudes = [Float](repeating: 0, count: halfFFT)

        for frameIndex in 0..<numFrames {
            let offset = frameIndex * hopSize

            // Window the current slice of `samples` straight into `frame`.
            samples.withUnsafeBufferPointer { samplePtr in
                guard let base = samplePtr.baseAddress else { return }
                vDSP_vmul(base + offset, 1, window, 1, &frame, 1, frameLength)
            }

            computePowerSpectrum(
                of: frame,
                using: fftSetup,
                log2n: log2n,
                real: &real,
                imag: &imag,
                magnitudes: &magnitudes
            )

            // Fold the spectrum into 12 chroma bins. Bin 0 is skipped: in the
            // packed real-FFT output it holds the DC and Nyquist terms.
            var chroma = [Double](repeating: 0, count: 12)
            for bin in 1..<halfFFT {
                let chromaBin = chromaMap[bin]
                if chromaBin >= 0 {
                    chroma[chromaBin] += Double(magnitudes[bin])
                }
            }
            chromagram.append(chroma)
        }

        return chromagram
    }

    /// Runs a forward real FFT over `frame` and writes the squared magnitude of
    /// each bin into `magnitudes`.
    ///
    /// `real` and `imag` are scratch storage for the split-complex data. Their
    /// base addresses are only used inside `withUnsafeMutableBufferPointer`,
    /// so the pointers held by `DSPSplitComplex` stay valid for every vDSP call.
    private static func computePowerSpectrum(
        of frame: [Float],
        using fftSetup: FFTSetup,
        log2n: vDSP_Length,
        real: inout [Float],
        imag: inout [Float],
        magnitudes: inout [Float]
    ) {
        let halfCount = frame.count / 2
        let halfLength = vDSP_Length(halfCount)

        real.withUnsafeMutableBufferPointer { realPtr in
            imag.withUnsafeMutableBufferPointer { imagPtr in
                guard let realBase = realPtr.baseAddress, let imagBase = imagPtr.baseAddress else { return }
                var split = DSPSplitComplex(realp: realBase, imagp: imagBase)

                // Pack the real samples as interleaved pairs into split-complex form.
                frame.withUnsafeBufferPointer { framePtr in
                    guard let frameBase = framePtr.baseAddress else { return }
                    frameBase.withMemoryRebound(to: DSPComplex.self, capacity: halfCount) { complexPtr in
                        vDSP_ctoz(complexPtr, 2, &split, 1, halfLength)
                    }
                }

                vDSP_fft_zrip(fftSetup, &split, 1, log2n, FFTDirection(kFFTDirection_Forward))
                vDSP_zvmags(&split, 1, &magnitudes, 1, halfLength)
            }
        }
    }

    /// Pre-compute which FFT bin maps to which chroma pitch class.
    ///
    /// - Returns: An array of `fftSize / 2` entries holding the pitch class
    ///   (0 = C ... 11 = B) for each bin, or -1 for bin 0 and for bins outside
    ///   the 30 Hz to 5000 Hz range.
    private static func buildChromaMap(fftSize: Int, sampleRate: Double) -> [Int] {
        let halfFFT = max(fftSize / 2, 0)
        var map = [Int](repeating: -1, count: halfFFT)

        for bin in map.indices.dropFirst() {
            let frequency = Double(bin) * sampleRate / Double(fftSize)

            // Only consider musically relevant frequencies (30 Hz to 5000 Hz).
            guard frequency >= 30 && frequency <= 5000 else { continue }

            // Semitone distance from A4, rounded to the nearest note.
            let semitones = 12.0 * log2(frequency / referenceFrequency)
            // A is pitch class 9, so shift by 9 to make C = 0. The extra +12
            // keeps the operand non-negative for notes below A4.
            let pitchClass = ((Int(round(semitones)) % 12) + 12 + 9) % 12
            map[bin] = pitchClass
        }

        return map
    }

    // MARK: - Average Chromagram

    /// Averages the chroma vectors across time.
    ///
    /// - Returns: The per-pitch-class mean, or 12 zeros when `chromagram` is empty.
    private static func averageChromagram(_ chromagram: [[Double]]) -> [Double] {
        guard !chromagram.isEmpty else { return [Double](repeating: 0, count: 12) }

        var avg = chromagram.reduce(into: [Double](repeating: 0, count: 12)) { sums, frame in
            for i in 0..<12 {
                sums[i] += frame[i]
            }
        }

        let count = Double(chromagram.count)
        for i in 0..<12 {
            avg[i] /= count
        }
        return avg
    }

    // MARK: - Key Profile Matching

    /// Picks the key whose profile correlates best with `chroma`.
    ///
    /// All 12 roots are tried against the major and then the minor profile;
    /// the first candidate with the strictly highest correlation wins.
    /// Confidence maps the winning correlation from [-1, 1] onto [0, 1]. A flat
    /// chroma vector (for example digital silence) carries no key information,
    /// so it is reported with a confidence of 0.
    private static func matchKeyProfile(chroma: [Double]) -> KeyResult {
        let profiles: [(weights: [Double], isMinor: Bool)] = [
            (majorProfile, false),
            (minorProfile, true)
        ]

        var bestCorrelation = -Double.greatestFiniteMagnitude
        var bestRoot = 0
        var bestIsMinor = false

        for root in 0..<12 {
            // Rotate chroma so 'root' aligns with index 0.
            let rotated = rotateChroma(chroma, by: root)

            for profile in profiles {
                let correlation = pearsonCorrelation(rotated, profile.weights)
                if correlation > bestCorrelation {
                    bestCorrelation = correlation
                    bestRoot = root
                    bestIsMinor = profile.isMinor
                }
            }
        }

        // With no variation between pitch classes every correlation is
        // undefined (reported as 0), so the winner is arbitrary.
        let hasTonalContent: Bool
        if let lowest = chroma.min(), let highest = chroma.max() {
            hasTonalContent = highest > lowest
        } else {
            hasTonalContent = false
        }

        let confidence = hasTonalContent ? max(0, min(1, (bestCorrelation + 1) / 2)) : 0
        let keyName = "\(noteNames[bestRoot]) \(bestIsMinor ? "Minor" : "Major")"
        let camelot = bestIsMinor ? camelotMinor[bestRoot] : camelotMajor[bestRoot]

        return KeyResult(
            key: keyName,
            camelotCode: camelot,
            confidence: confidence,
            rootNote: bestRoot,
            isMinor: bestIsMinor
        )
    }

    /// Returns `chroma` rotated left by `amount`, so element `amount` moves to index 0.
    private static func rotateChroma(_ chroma: [Double], by amount: Int) -> [Double] {
        let n = chroma.count
        return (0..<n).map { chroma[($0 + amount) % n] }
    }

    /// Pearson correlation coefficient of `a` and `b`.
    ///
    /// - Returns: A value in [-1, 1], or 0 when the denominator is not
    ///   positive (for example when either input has no variance).
    private static func pearsonCorrelation(_ a: [Double], _ b: [Double]) -> Double {
        let n = Double(a.count)
        let sumA = a.reduce(0, +)
        let sumB = b.reduce(0, +)
        let sumAB = zip(a, b).map(*).reduce(0, +)
        let sumA2 = a.map { $0 * $0 }.reduce(0, +)
        let sumB2 = b.map { $0 * $0 }.reduce(0, +)

        let numerator = n * sumAB - sumA * sumB
        let denominator = sqrt((n * sumA2 - sumA * sumA) * (n * sumB2 - sumB * sumB))

        guard denominator > 0 else { return 0 }
        return numerator / denominator
    }
}
// MARK: - Errors

/// Errors reported by key detection, each with a user-readable description.
enum KeyDetectionError: Error, LocalizedError {
    /// The audio is too short to analyse.
    case insufficientAudio
    /// The audio format could not be read.
    case formatError
    /// No audio data was found.
    case noAudioData

    /// Human-readable message for the error.
    var errorDescription: String? {
        let message: String
        switch self {
        case .insufficientAudio:
            message = "Audio file is too short for key detection"
        case .formatError:
            message = "Unable to read audio format"
        case .noAudioData:
            message = "No audio data found"
        }
        return message
    }
}
|