This assumes you set up your AVCaptureAudioDataOutput something like this (the key assumption is the 16-bit integer sample depth):

// Configure the capture output to vend linear PCM: mono, 16 kHz,
// 16-bit signed integer samples, little-endian, interleaved.
let audioOutput = AVCaptureAudioDataOutput()
audioOutput.audioSettings =  [
    AVFormatIDKey: kAudioFormatLinearPCM,
    AVNumberOfChannelsKey: 1,
    AVSampleRateKey: 16000.0,

    AVLinearPCMBitDepthKey: 16,
    AVLinearPCMIsBigEndianKey: false,
    AVLinearPCMIsFloatKey: false,
    AVLinearPCMIsNonInterleaved: false
]  // NOTE(review): the original snippet never closed this dictionary literal.

This is the AVCaptureAudioDataOutputSampleBufferDelegate implementation (the object you register via setSampleBufferDelegate):

/// Sample-buffer delegate callback: copies the buffer's PCM payload into a
/// `Data` value.
///
/// - Parameters:
///   - output: The capture output that produced the buffer.
///   - sampleBuffer: The incoming audio sample buffer (configured above as
///     16-bit mono linear PCM — TODO confirm the output settings match).
///   - connection: The capture connection the buffer arrived on.
func captureOutput(_ output: AVCaptureOutput, didOutput sampleBuffer: CMSampleBuffer, from connection: AVCaptureConnection) {
    // Total payload size: sample count × bytes per sample.
    let byteCount = sampleBuffer.numSamples * sampleBuffer.sampleSize(at: 0)
    // Nothing to copy — also avoids a zero-byte allocation below.
    guard byteCount > 0 else { return }

    // Scratch buffer that receives the raw PCM bytes. Alignment must be a
    // power of two (the original passed 0, which is invalid); Int16's
    // alignment matches the 16-bit samples configured above.
    let pcmBytes = UnsafeMutableRawPointer.allocate(
        byteCount: byteCount,
        alignment: MemoryLayout<Int16>.alignment
    )
    let bufferList = UnsafeMutablePointer<AudioBufferList>.allocate(capacity: 1)
    // Release both allocations on every exit path — the original leaked
    // them on each callback.
    defer {
        bufferList.deinitialize(count: 1)
        bufferList.deallocate()
        pcmBytes.deallocate()
    }
    bufferList.initialize(to: AudioBufferList(
        mNumberBuffers: 1,
        mBuffers: AudioBuffer(
            mNumberChannels: 1,
            mDataByteSize: UInt32(byteCount),
            mData: pcmBytes
        )
    ))

    do {
        try sampleBuffer.copyPCMData(fromRange: 0..<sampleBuffer.numSamples, into: bufferList)

        // Copies the bytes out of the scratch buffer, so `data` remains
        // valid after the deferred deallocation.
        let data = Data(bytes: pcmBytes, count: byteCount)
        // Do something with the data

    } catch {
        logE("Error converting buffer: \(error.localizedDescription)")
    }
}

In production you will likely not want to allocate these buffers on each callback — allocate them once (sized for the largest expected sample buffer) and reuse them across calls.