From 10a9cce855fbf383a8f0ea5511526777062a03c4 Mon Sep 17 00:00:00 2001 From: eZio Pan Date: Tue, 19 Mar 2024 20:09:36 +0800 Subject: [PATCH] stm32 CORDIC: DMA for q1.31 and q1.15 --- embassy-stm32/src/cordic/mod.rs | 264 ++++++++++++++++++++++-------- embassy-stm32/src/cordic/utils.rs | 13 ++ 2 files changed, 209 insertions(+), 68 deletions(-) diff --git a/embassy-stm32/src/cordic/mod.rs b/embassy-stm32/src/cordic/mod.rs index 9875d73bb..a4b98a770 100644 --- a/embassy-stm32/src/cordic/mod.rs +++ b/embassy-stm32/src/cordic/mod.rs @@ -1,5 +1,6 @@ //! CORDIC co-processor +use embassy_hal_internal::drop::OnDrop; use embassy_hal_internal::{into_ref, Peripheral, PeripheralRef}; use crate::{dma, peripherals}; @@ -100,9 +101,6 @@ impl<'d, T: Instance> Cordic<'d, T> { warn!("At least 1 result hasn't been read, reconfigure will cause DATA LOST"); }; - self.peri.disable_write_dma(); - self.peri.disable_read_dma(); - // clean RRDY flag while self.peri.ready_to_read() { self.peri.read_result(); @@ -115,22 +113,6 @@ impl<'d, T: Instance> Cordic<'d, T> { // we don't set NRES in here, but to make sure NRES is set each time user call "calc"-ish functions, // since each "calc"-ish functions can have different ARGSIZE and RESSIZE, thus NRES should be change accrodingly. } - - fn blocking_read_f32(&mut self) -> (f32, Option) { - let reg_value = self.peri.read_result(); - - let res1 = utils::q1_15_to_f32((reg_value & ((1u32 << 16) - 1)) as u16); - - // We don't care about whether the function return 1 or 2 results, - // the only thing matter is whether user want 1 or 2 results. 
- let res2 = if !self.config.first_result { - Some(utils::q1_15_to_f32((reg_value >> 16) as u16)) - } else { - None - }; - - (res1, res2) - } } impl<'d, T: Instance> Drop for Cordic<'d, T> { @@ -141,7 +123,7 @@ impl<'d, T: Instance> Drop for Cordic<'d, T> { // q1.31 related impl<'d, T: Instance> Cordic<'d, T> { - /// Run a blocking CORDIC calculation + /// Run a blocking CORDIC calculation in q1.31 format pub fn blocking_calc_32bit(&mut self, arg1s: &[f64], arg2s: Option<&[f64]>, output: &mut [f64]) -> usize { if arg1s.is_empty() { return 0; @@ -157,9 +139,6 @@ impl<'d, T: Instance> Cordic<'d, T> { self.check_input_f64(arg1s, arg2s); - self.peri.disable_write_dma(); - self.peri.disable_read_dma(); - self.peri.set_result_count(if self.config.first_result { Count::One } else { @@ -172,7 +151,10 @@ impl<'d, T: Instance> Cordic<'d, T> { let mut consumed_input_len = 0; - // put double input into cordic + // + // handle 2 input args calculation + // + if arg2s.is_some() && !arg2s.expect("It's infailable").is_empty() { let arg2s = arg2s.expect("It's infailable"); @@ -202,7 +184,10 @@ impl<'d, T: Instance> Cordic<'d, T> { self.blocking_read_f64_to_buf(output, &mut output_count); } - // put single input into cordic + // + // handle 1 input arg calculation + // + let input_left = &arg1s[consumed_input_len..]; if !input_left.is_empty() { @@ -225,27 +210,14 @@ impl<'d, T: Instance> Cordic<'d, T> { output_count } - fn blocking_read_f64(&mut self) -> (f64, Option) { - let res1 = utils::q1_31_to_f64(self.peri.read_result()); + fn blocking_read_f64_to_buf(&mut self, result_buf: &mut [f64], result_index: &mut usize) { + result_buf[*result_index] = utils::q1_31_to_f64(self.peri.read_result()); + *result_index += 1; // We don't care about whether the function return 1 or 2 results, // the only thing matter is whether user want 1 or 2 results. 
- let res2 = if !self.config.first_result { - Some(utils::q1_31_to_f64(self.peri.read_result())) - } else { - None - }; - - (res1, res2) - } - - fn blocking_read_f64_to_buf(&mut self, result_buf: &mut [f64], result_index: &mut usize) { - let (res1, res2) = self.blocking_read_f64(); - result_buf[*result_index] = res1; - *result_index += 1; - - if let Some(res2) = res2 { - result_buf[*result_index] = res2; + if !self.config.first_result { + result_buf[*result_index] = utils::q1_31_to_f64(self.peri.read_result()); *result_index += 1; } } @@ -254,7 +226,7 @@ impl<'d, T: Instance> Cordic<'d, T> { self.peri.write_argument(utils::f64_to_q1_31(arg)); } - /// Run a async CORDIC calculation + /// Run an async CORDIC calculation in q1.31 format pub async fn async_calc_32bit( &mut self, write_dma: impl Peripheral

>, @@ -292,8 +264,9 @@ impl<'d, T: Instance> Cordic<'d, T> { let mut input_buf = [0u32; INPUT_BUF_MAX_LEN]; let mut input_buf_len = 0; - self.peri.enable_write_dma(); - self.peri.enable_read_dma(); + // + // handle 2 input args calculation + // if !arg2s.unwrap_or_default().is_empty() { let arg2s = arg2s.expect("It's infailable"); @@ -311,7 +284,7 @@ impl<'d, T: Instance> Cordic<'d, T> { } if input_buf_len == INPUT_BUF_MAX_LEN { - self.dma_calc_32bit( + self.inner_dma_calc_32bit( &mut write_dma, &mut read_dma, true, @@ -325,12 +298,8 @@ impl<'d, T: Instance> Cordic<'d, T> { } } - if input_buf_len % 2 != 0 { - panic!("input buf len should be multiple of 2 in double mode") - } - if input_buf_len > 0 { - self.dma_calc_32bit( + self.inner_dma_calc_32bit( &mut write_dma, &mut read_dma, true, @@ -344,7 +313,9 @@ impl<'d, T: Instance> Cordic<'d, T> { } } - // single input + // + // handle 1 input arg calculation + // if arg1s.len() > consumed_input_len { let input_remain = &arg1s[consumed_input_len..]; @@ -356,7 +327,7 @@ impl<'d, T: Instance> Cordic<'d, T> { input_buf_len += 1; if input_buf_len == INPUT_BUF_MAX_LEN { - self.dma_calc_32bit( + self.inner_dma_calc_32bit( &mut write_dma, &mut read_dma, false, @@ -371,7 +342,7 @@ impl<'d, T: Instance> Cordic<'d, T> { } if input_buf_len > 0 { - self.dma_calc_32bit( + self.inner_dma_calc_32bit( &mut write_dma, &mut read_dma, false, @@ -388,32 +359,47 @@ impl<'d, T: Instance> Cordic<'d, T> { output_count } - async fn dma_calc_32bit( + // this function is highly coupled with async_calc_32bit, and is not intended to use in other place + async fn inner_dma_calc_32bit( &mut self, write_dma: impl Peripheral

>, read_dma: impl Peripheral

>, - double_input: bool, - input_buf: &[u32], - output: &mut [f64], - output_start_index: &mut usize, + double_input: bool, // gather extra info to calc output_buf size + input_buf: &[u32], // input_buf, its content should be exact values and length for calculation + output: &mut [f64], // caller should use this as a final output array + output_start_index: &mut usize, // the index of start point of the output for this round of calculation ) { into_ref!(write_dma, read_dma); let write_req = write_dma.request(); let read_req = read_dma.request(); - let mut output_buf = [0u32; INPUT_BUF_MAX_LEN * 2]; // make output_buf long enough + // output_buf is the place to store raw value from CORDIC (via DMA). + // For buf size, we assume in this round of calculation: + // all input is 1 arg, and all calculation need 2 output, + // thus output_buf will always be long enough. + let mut output_buf = [0u32; INPUT_BUF_MAX_LEN * 2]; let mut output_buf_size = input_buf.len(); if !self.config.first_result { + // if we need 2 result for 1 input, then output_buf length should be 2x long. output_buf_size *= 2; }; if double_input { + // if input itself is 2 args for 1 calculation, then output_buf length should be /2. 
output_buf_size /= 2; } let active_output_buf = &mut output_buf[..output_buf_size]; + self.peri.enable_write_dma(); + self.peri.enable_read_dma(); + + let on_drop = OnDrop::new(|| { + self.peri.disable_write_dma(); + self.peri.disable_read_dma(); + }); + unsafe { let write_transfer = dma::Transfer::new_write( &mut write_dma, @@ -434,6 +420,8 @@ impl<'d, T: Instance> Cordic<'d, T> { embassy_futures::join::join(write_transfer, read_transfer).await; } + drop(on_drop); + for &mut output_u32 in active_output_buf { output[*output_start_index] = utils::q1_31_to_f64(output_u32); *output_start_index += 1; @@ -443,7 +431,7 @@ impl<'d, T: Instance> Cordic<'d, T> { // q1.15 related impl<'d, T: Instance> Cordic<'d, T> { - /// Run a CORDIC calculation + /// Run a blocking CORDIC calculation in q1.15 format pub fn blocking_calc_16bit(&mut self, arg1s: &[f32], arg2s: Option<&[f32]>, output: &mut [f32]) -> usize { if arg1s.is_empty() { return 0; @@ -459,9 +447,6 @@ impl<'d, T: Instance> Cordic<'d, T> { self.check_input_f32(arg1s, arg2s); - self.peri.disable_write_dma(); - self.peri.disable_read_dma(); - // In q1.15 mode, 1 write/read to access 2 arguments/results self.peri.set_argument_count(Count::One); self.peri.set_result_count(Count::One); @@ -506,20 +491,163 @@ impl<'d, T: Instance> Cordic<'d, T> { } fn blocking_write_f32(&mut self, arg1: f32, arg2: f32) { - let reg_value: u32 = utils::f32_to_q1_15(arg1) as u32 + ((utils::f32_to_q1_15(arg2) as u32) << 16); + let reg_value: u32 = utils::f32_args_to_u32(arg1, arg2); self.peri.write_argument(reg_value); } fn blocking_read_f32_to_buf(&mut self, result_buf: &mut [f32], result_index: &mut usize) { - let (res1, res2) = self.blocking_read_f32(); + let reg_value = self.peri.read_result(); + + let (res1, res2) = utils::u32_to_f32_res(reg_value); + result_buf[*result_index] = res1; *result_index += 1; - if let Some(res2) = res2 { + // We don't care about whether the function return 1 or 2 results, + // the only thing matter is whether 
user want 1 or 2 results. + if !self.config.first_result { result_buf[*result_index] = res2; *result_index += 1; } } + + /// Run a async CORDIC calculation in q1.15 format + pub async fn async_calc_16bit( + &mut self, + write_dma: impl Peripheral

>, + read_dma: impl Peripheral

>, + arg1s: &[f32], + arg2s: Option<&[f32]>, + output: &mut [f32], + ) -> usize { + if arg1s.is_empty() { + return 0; + } + + assert!( + match self.config.first_result { + true => output.len() >= arg1s.len(), + false => output.len() >= 2 * arg1s.len(), + }, + "Output buf length is not long enough" + ); + + self.check_input_f32(arg1s, arg2s); + + into_ref!(write_dma, read_dma); + + // In q1.15 mode, 1 write/read to access 2 arguments/results + self.peri.set_argument_count(Count::One); + self.peri.set_result_count(Count::One); + + self.peri.set_data_width(Width::Bits16, Width::Bits16); + + let mut output_count = 0; + let mut input_buf = [0u32; INPUT_BUF_MAX_LEN]; + let mut input_buf_len = 0; + + // In q1.15 mode, we always fill 1 pair of 16bit value into WDATA register. + // If arg2s is None or empty array, we assume arg2 value always 1.0 (as reset value for ARG2). + // If arg2s has some value, but is not as long as arg1s, + // we fill the rest of the arg2 values with the last value from arg2s (as q1.31 version does) + + let arg2_default_value = match arg2s { + Some(arg2s) if !arg2s.is_empty() => arg2s[arg2s.len() - 1], + _ => 1.0, + }; + + let args = arg1s.iter().zip( + arg2s + .unwrap_or(&[]) + .iter() + .chain(core::iter::repeat(&arg2_default_value)), + ); + + for (&arg1, &arg2) in args { + input_buf[input_buf_len] = utils::f32_args_to_u32(arg1, arg2); + input_buf_len += 1; + + if input_buf_len == INPUT_BUF_MAX_LEN { + self.inner_dma_calc_16bit(&mut write_dma, &mut read_dma, &input_buf, output, &mut output_count) + .await; + } + } + + if input_buf_len > 0 { + self.inner_dma_calc_16bit( + &mut write_dma, + &mut read_dma, + &input_buf[..input_buf_len], + output, + &mut output_count, + ) + .await; + } + + output_count + } + + // this function is highly coupled with async_calc_16bit, and is not intended to be used elsewhere + async fn inner_dma_calc_16bit( + &mut self, + write_dma: impl Peripheral

>, + read_dma: impl Peripheral

>, + input_buf: &[u32], // input_buf, its content should be exact values and length for calculation + output: &mut [f32], // caller should use this as a final output array + output_start_index: &mut usize, // the index of start point of the output for this round of calculation + ) { + into_ref!(write_dma, read_dma); + + let write_req = write_dma.request(); + let read_req = read_dma.request(); + + // output_buf is the place to store raw value from CORDIC (via DMA). + let mut output_buf = [0u32; INPUT_BUF_MAX_LEN]; + + let active_output_buf = &mut output_buf[..input_buf.len()]; + + self.peri.enable_write_dma(); + self.peri.enable_read_dma(); + + let on_drop = OnDrop::new(|| { + self.peri.disable_write_dma(); + self.peri.disable_read_dma(); + }); + + unsafe { + let write_transfer = dma::Transfer::new_write( + &mut write_dma, + write_req, + input_buf, + T::regs().wdata().as_ptr() as *mut _, + Default::default(), + ); + + let read_transfer = dma::Transfer::new_read( + &mut read_dma, + read_req, + T::regs().rdata().as_ptr() as *mut _, + active_output_buf, + Default::default(), + ); + + embassy_futures::join::join(write_transfer, read_transfer).await; + } + + drop(on_drop); + + for &mut output_u32 in active_output_buf { + let (res1, res2) = utils::u32_to_f32_res(output_u32); + + output[*output_start_index] = res1; + *output_start_index += 1; + + if !self.config.first_result { + output[*output_start_index] = res2; + *output_start_index += 1; + } + } + } } // check input value ARG1, ARG2, SCALE and FUNCTION are compatible with each other diff --git a/embassy-stm32/src/cordic/utils.rs b/embassy-stm32/src/cordic/utils.rs index 2f4b5c5e8..79bef6b97 100644 --- a/embassy-stm32/src/cordic/utils.rs +++ b/embassy-stm32/src/cordic/utils.rs @@ -57,3 +57,16 @@ floating_fixed_convert!( 15, 0x3800_0000u32 // binary form of 1f32^(-15) ); + +#[inline(always)] +pub(crate) fn f32_args_to_u32(arg1: f32, arg2: f32) -> u32 { + f32_to_q1_15(arg1) as u32 + ((f32_to_q1_15(arg2) as u32) << 16) 
+} + +#[inline(always)] +pub(crate) fn u32_to_f32_res(reg_value: u32) -> (f32, f32) { + let res1 = q1_15_to_f32((reg_value & ((1u32 << 16) - 1)) as u16); + let res2 = q1_15_to_f32((reg_value >> 16) as u16); + + (res1, res2) +}