|
1 | NO CONTENT: new file 10644 |
This diff has been collapsed as it changes many lines, (704 lines changed) Show them Hide them | |||
@@ -0,0 +1,704 | |||
|
1 | Dynamic DMA mapping using the generic device | |
|
2 | ============================================ | |
|
3 | ||
|
4 | James E.J. Bottomley <James.Bottomley@HansenPartnership.com> | |
|
5 | ||
|
6 | This document describes the DMA API. For a more gentle introduction | |
|
7 | of the API (and actual examples), see Documentation/DMA-API-HOWTO.txt. | |
|
8 | ||
|
9 | This API is split into two pieces. Part I describes the basic API. | |
|
10 | Part II describes extensions for supporting non-consistent memory | |
|
11 | machines. Unless you know that your driver absolutely has to support | |
|
12 | non-consistent platforms (this is usually only legacy platforms) you | |
|
13 | should only use the API described in part I. | |
|
14 | ||
|
15 | Part I - dma_ API | |
|
16 | ------------------------------------- | |
|
17 | ||
|
18 | To get the dma_ API, you must #include <linux/dma-mapping.h>. This | |
|
19 | provides dma_addr_t and the interfaces described below. | |
|
20 | ||
|
21 | A dma_addr_t can hold any valid DMA or bus address for the platform. It | |
|
22 | can be given to a device to use as a DMA source or target. A CPU cannot | |
|
23 | reference a dma_addr_t directly because there may be translation between | |
|
24 | its physical address space and the bus address space. | |
|
25 | ||
|
26 | Part Ia - Using large DMA-coherent buffers | |
|
27 | ------------------------------------------ | |
|
28 | ||
|
29 | void * | |
|
30 | dma_alloc_coherent(struct device *dev, size_t size, | |
|
31 | dma_addr_t *dma_handle, gfp_t flag) | |
|
32 | ||
|
33 | Consistent memory is memory for which a write by either the device or | |
|
34 | the processor can immediately be read by the processor or device | |
|
35 | without having to worry about caching effects. (You may however need | |
|
36 | to make sure to flush the processor's write buffers before telling | |
|
37 | devices to read that memory.) | |
|
38 | ||
|
39 | This routine allocates a region of <size> bytes of consistent memory. | |
|
40 | ||
|
41 | It returns a pointer to the allocated region (in the processor's virtual | |
|
42 | address space) or NULL if the allocation failed. | |
|
43 | ||
|
44 | It also returns a <dma_handle> which may be cast to an unsigned integer the | |
|
45 | same width as the bus and given to the device as the bus address base of | |
|
46 | the region. | |
|
47 | ||
|
48 | Note: consistent memory can be expensive on some platforms, and the | |
|
49 | minimum allocation length may be as big as a page, so you should | |
|
50 | consolidate your requests for consistent memory as much as possible. | |
|
51 | The simplest way to do that is to use the dma_pool calls (see below). | |
|
52 | ||
|
53 | The flag parameter (dma_alloc_coherent() only) allows the caller to | |
|
54 | specify the GFP_ flags (see kmalloc()) for the allocation (the | |
|
55 | implementation may choose to ignore flags that affect the location of | |
|
56 | the returned memory, like GFP_DMA). | |
|
57 | ||
|
58 | void * | |
|
59 | dma_zalloc_coherent(struct device *dev, size_t size, | |
|
60 | dma_addr_t *dma_handle, gfp_t flag) | |
|
61 | ||
|
62 | Wraps dma_alloc_coherent() and also zeroes the returned memory if the | |
|
63 | allocation attempt succeeded. | |
|
64 | ||
|
65 | void | |
|
66 | dma_free_coherent(struct device *dev, size_t size, void *cpu_addr, | |
|
67 | dma_addr_t dma_handle) | |
|
68 | ||
|
69 | Free a region of consistent memory you previously allocated. dev, | |
|
70 | size and dma_handle must all be the same as those passed into | |
|
71 | dma_alloc_coherent(). cpu_addr must be the virtual address returned by | |
|
72 | the dma_alloc_coherent(). | |
|
73 | ||
|
74 | Note that unlike their sibling allocation calls, these routines | |
|
75 | may only be called with IRQs enabled. | |
|
76 | ||
|
77 | ||
|
78 | Part Ib - Using small DMA-coherent buffers | |
|
79 | ------------------------------------------ | |
|
80 | ||
|
81 | To get this part of the dma_ API, you must #include <linux/dmapool.h> | |
|
82 | ||
|
83 | Many drivers need lots of small DMA-coherent memory regions for DMA | |
|
84 | descriptors or I/O buffers. Rather than allocating in units of a page | |
|
85 | or more using dma_alloc_coherent(), you can use DMA pools. These work | |
|
86 | much like a struct kmem_cache, except that they use the DMA-coherent allocator, | |
|
87 | not __get_free_pages(). Also, they understand common hardware constraints | |
|
88 | for alignment, like queue heads needing to be aligned on N-byte boundaries. | |
|
89 | ||
|
90 | ||
|
91 | struct dma_pool * | |
|
92 | dma_pool_create(const char *name, struct device *dev, | |
|
93 | size_t size, size_t align, size_t alloc); | |
|
94 | ||
|
95 | dma_pool_create() initializes a pool of DMA-coherent buffers | |
|
96 | for use with a given device. It must be called in a context which | |
|
97 | can sleep. | |
|
98 | ||
|
99 | The "name" is for diagnostics (like a struct kmem_cache name); dev and size | |
|
100 | are like what you'd pass to dma_alloc_coherent(). The device's hardware | |
|
101 | alignment requirement for this type of data is "align" (which is expressed | |
|
102 | in bytes, and must be a power of two). If your device has no boundary | |
|
103 | crossing restrictions, pass 0 for alloc; passing 4096 says memory allocated | |
|
104 | from this pool must not cross 4KByte boundaries. | |
|
105 | ||
|
106 | ||
|
107 | void *dma_pool_alloc(struct dma_pool *pool, gfp_t gfp_flags, | |
|
108 | dma_addr_t *dma_handle); | |
|
109 | ||
|
110 | This allocates memory from the pool; the returned memory will meet the | |
|
111 | size and alignment requirements specified at creation time. Pass | |
|
112 | GFP_ATOMIC to prevent blocking, or if it's permitted (not | |
|
113 | in_interrupt, not holding SMP locks), pass GFP_KERNEL to allow | |
|
114 | blocking. Like dma_alloc_coherent(), this returns two values: an | |
|
115 | address usable by the CPU, and the DMA address usable by the pool's | |
|
116 | device. | |
|
117 | ||
|
118 | ||
|
119 | void dma_pool_free(struct dma_pool *pool, void *vaddr, | |
|
120 | dma_addr_t addr); | |
|
121 | ||
|
122 | This puts memory back into the pool. The pool is what was passed to | |
|
123 | dma_pool_alloc(); the CPU (vaddr) and DMA addresses are what | |
|
124 | were returned when that routine allocated the memory being freed. | |
|
125 | ||
|
126 | ||
|
127 | void dma_pool_destroy(struct dma_pool *pool); | |
|
128 | ||
|
129 | dma_pool_destroy() frees the resources of the pool. It must be | |
|
130 | called in a context which can sleep. Make sure you've freed all allocated | |
|
131 | memory back to the pool before you destroy it. | |
|
132 | ||
|
133 | ||
|
134 | Part Ic - DMA addressing limitations | |
|
135 | ------------------------------------ | |
|
136 | ||
|
137 | int | |
|
138 | dma_supported(struct device *dev, u64 mask) | |
|
139 | ||
|
140 | Checks to see if the device can support DMA to the memory described by | |
|
141 | mask. | |
|
142 | ||
|
143 | Returns: 1 if it can and 0 if it can't. | |
|
144 | ||
|
145 | Notes: This routine merely tests to see if the mask is possible. It | |
|
146 | won't change the current mask settings. It is more intended as an | |
|
147 | internal API for use by the platform than an external API for use by | |
|
148 | driver writers. | |
|
149 | ||
|
150 | int | |
|
151 | dma_set_mask_and_coherent(struct device *dev, u64 mask) | |
|
152 | ||
|
153 | Checks to see if the mask is possible and updates the device | |
|
154 | streaming and coherent DMA mask parameters if it is. | |
|
155 | ||
|
156 | Returns: 0 if successful and a negative error if not. | |
|
157 | ||
|
158 | int | |
|
159 | dma_set_mask(struct device *dev, u64 mask) | |
|
160 | ||
|
161 | Checks to see if the mask is possible and updates the device | |
|
162 | parameters if it is. | |
|
163 | ||
|
164 | Returns: 0 if successful and a negative error if not. | |
|
165 | ||
|
166 | int | |
|
167 | dma_set_coherent_mask(struct device *dev, u64 mask) | |
|
168 | ||
|
169 | Checks to see if the mask is possible and updates the device | |
|
170 | parameters if it is. | |
|
171 | ||
|
172 | Returns: 0 if successful and a negative error if not. | |
|
173 | ||
|
174 | u64 | |
|
175 | dma_get_required_mask(struct device *dev) | |
|
176 | ||
|
177 | This API returns the mask that the platform requires to | |
|
178 | operate efficiently. Usually this means the returned mask | |
|
179 | is the minimum required to cover all of memory. Examining the | |
|
180 | required mask gives drivers with variable descriptor sizes the | |
|
181 | opportunity to use smaller descriptors as necessary. | |
|
182 | ||
|
183 | Requesting the required mask does not alter the current mask. If you | |
|
184 | wish to take advantage of it, you should issue a dma_set_mask() | |
|
185 | call to set the mask to the value returned. | |
|
186 | ||
|
187 | ||
|
188 | Part Id - Streaming DMA mappings | |
|
189 | -------------------------------- | |
|
190 | ||
|
191 | dma_addr_t | |
|
192 | dma_map_single(struct device *dev, void *cpu_addr, size_t size, | |
|
193 | enum dma_data_direction direction) | |
|
194 | ||
|
195 | Maps a piece of processor virtual memory so it can be accessed by the | |
|
196 | device and returns the bus address of the memory. | |
|
197 | ||
|
198 | The direction for both APIs may be converted freely by casting. | |
|
199 | However the dma_ API uses a strongly typed enumerator for its | |
|
200 | direction: | |
|
201 | ||
|
202 | DMA_NONE no direction (used for debugging) | |
|
203 | DMA_TO_DEVICE data is going from the memory to the device | |
|
204 | DMA_FROM_DEVICE data is coming from the device to the memory | |
|
205 | DMA_BIDIRECTIONAL direction isn't known | |
|
206 | ||
|
207 | Notes: Not all memory regions in a machine can be mapped by this API. | |
|
208 | Further, contiguous kernel virtual space may not be contiguous as | |
|
209 | physical memory. Since this API does not provide any scatter/gather | |
|
210 | capability, it will fail if the user tries to map a non-physically | |
|
211 | contiguous piece of memory. For this reason, memory to be mapped by | |
|
212 | this API should be obtained from sources which guarantee it to be | |
|
213 | physically contiguous (like kmalloc). | |
|
214 | ||
|
215 | Further, the bus address of the memory must be within the | |
|
216 | dma_mask of the device (the dma_mask is a bit mask of the | |
|
217 | addressable region for the device, i.e., if the bus address of | |
|
218 | the memory ANDed with the dma_mask is still equal to the bus | |
|
219 | address, then the device can perform DMA to the memory). To | |
|
220 | ensure that the memory allocated by kmalloc is within the dma_mask, | |
|
221 | the driver may specify various platform-dependent flags to restrict | |
|
222 | the bus address range of the allocation (e.g., on x86, GFP_DMA | |
|
223 | guarantees to be within the first 16MB of available bus addresses, | |
|
224 | as required by ISA devices). | |
|
225 | ||
|
226 | Note also that the above constraints on physical contiguity and | |
|
227 | dma_mask may not apply if the platform has an IOMMU (a device which | |
|
228 | maps an I/O bus address to a physical memory address). However, to be | |
|
229 | portable, device driver writers may *not* assume that such an IOMMU | |
|
230 | exists. | |
|
231 | ||
|
232 | Warnings: Memory coherency operates at a granularity called the cache | |
|
233 | line width. In order for memory mapped by this API to operate | |
|
234 | correctly, the mapped region must begin exactly on a cache line | |
|
235 | boundary and end exactly on one (to prevent two separately mapped | |
|
236 | regions from sharing a single cache line). Since the cache line size | |
|
237 | may not be known at compile time, the API will not enforce this | |
|
238 | requirement. Therefore, it is recommended that driver writers who | |
|
239 | don't take special care to determine the cache line size at run time | |
|
240 | only map virtual regions that begin and end on page boundaries (which | |
|
241 | are guaranteed also to be cache line boundaries). | |
|
242 | ||
|
243 | DMA_TO_DEVICE synchronisation must be done after the last modification | |
|
244 | of the memory region by the software and before it is handed off to | |
|
245 | the driver. Once this primitive is used, memory covered by this | |
|
246 | primitive should be treated as read-only by the device. If the device | |
|
247 | may write to it at any point, it should be DMA_BIDIRECTIONAL (see | |
|
248 | below). | |
|
249 | ||
|
250 | DMA_FROM_DEVICE synchronisation must be done before the driver | |
|
251 | accesses data that may be changed by the device. This memory should | |
|
252 | be treated as read-only by the driver. If the driver needs to write | |
|
253 | to it at any point, it should be DMA_BIDIRECTIONAL (see below). | |
|
254 | ||
|
255 | DMA_BIDIRECTIONAL requires special handling: it means that the driver | |
|
256 | isn't sure if the memory was modified before being handed off to the | |
|
257 | device and also isn't sure if the device will also modify it. Thus, | |
|
258 | you must always sync bidirectional memory twice: once before the | |
|
259 | memory is handed off to the device (to make sure all memory changes | |
|
260 | are flushed from the processor) and once before the data may be | |
|
261 | accessed after being used by the device (to make sure any processor | |
|
262 | cache lines are updated with data that the device may have changed). | |
|
263 | ||
|
264 | void | |
|
265 | dma_unmap_single(struct device *dev, dma_addr_t dma_addr, size_t size, | |
|
266 | enum dma_data_direction direction) | |
|
267 | ||
|
268 | Unmaps the region previously mapped. All the parameters passed in | |
|
269 | must be identical to those passed in (and returned) by the mapping | |
|
270 | API. | |
|
271 | ||
|
272 | dma_addr_t | |
|
273 | dma_map_page(struct device *dev, struct page *page, | |
|
274 | unsigned long offset, size_t size, | |
|
275 | enum dma_data_direction direction) | |
|
276 | void | |
|
277 | dma_unmap_page(struct device *dev, dma_addr_t dma_address, size_t size, | |
|
278 | enum dma_data_direction direction) | |
|
279 | ||
|
280 | API for mapping and unmapping for pages. All the notes and warnings | |
|
281 | for the other mapping APIs apply here. Also, although the <offset> | |
|
282 | and <size> parameters are provided to do partial page mapping, it is | |
|
283 | recommended that you never use these unless you really know what the | |
|
284 | cache width is. | |
|
285 | ||
|
286 | int | |
|
287 | dma_mapping_error(struct device *dev, dma_addr_t dma_addr) | |
|
288 | ||
|
289 | In some circumstances dma_map_single() and dma_map_page() will fail to create | |
|
290 | a mapping. A driver can check for these errors by testing the returned | |
|
291 | DMA address with dma_mapping_error(). A non-zero return value means the mapping | |
|
292 | could not be created and the driver should take appropriate action (e.g. | |
|
293 | reduce current DMA mapping usage or delay and try again later). | |
|
294 | ||
|
295 | int | |
|
296 | dma_map_sg(struct device *dev, struct scatterlist *sg, | |
|
297 | int nents, enum dma_data_direction direction) | |
|
298 | ||
|
299 | Returns: the number of bus address segments mapped (this may be shorter | |
|
300 | than <nents> passed in if some elements of the scatter/gather list are | |
|
301 | physically or virtually adjacent and an IOMMU maps them with a single | |
|
302 | entry). | |
|
303 | ||
|
304 | Please note that the sg cannot be mapped again if it has been mapped once. | |
|
305 | The mapping process is allowed to destroy information in the sg. | |
|
306 | ||
|
307 | As with the other mapping interfaces, dma_map_sg() can fail. When it | |
|
308 | does, 0 is returned and a driver must take appropriate action. It is | |
|
309 | critical that the driver do something, in the case of a block driver | |
|
310 | aborting the request or even oopsing is better than doing nothing and | |
|
311 | corrupting the filesystem. | |
|
312 | ||
|
313 | With scatterlists, you use the resulting mapping like this: | |
|
314 | ||
|
315 | int i, count = dma_map_sg(dev, sglist, nents, direction); | |
|
316 | struct scatterlist *sg; | |
|
317 | ||
|
318 | for_each_sg(sglist, sg, count, i) { | |
|
319 | hw_address[i] = sg_dma_address(sg); | |
|
320 | hw_len[i] = sg_dma_len(sg); | |
|
321 | } | |
|
322 | ||
|
323 | where nents is the number of entries in the sglist. | |
|
324 | ||
|
325 | The implementation is free to merge several consecutive sglist entries | |
|
326 | into one (e.g. with an IOMMU, or if several pages just happen to be | |
|
327 | physically contiguous) and returns the actual number of sg entries it | |
|
328 | mapped them to. On failure, 0 is returned. | |
|
329 | ||
|
330 | Then you should loop count times (note: this can be less than nents times) | |
|
331 | and use sg_dma_address() and sg_dma_len() macros where you previously | |
|
332 | accessed sg->address and sg->length as shown above. | |
|
333 | ||
|
334 | void | |
|
335 | dma_unmap_sg(struct device *dev, struct scatterlist *sg, | |
|
336 | int nhwentries, enum dma_data_direction direction) | |
|
337 | ||
|
338 | Unmap the previously mapped scatter/gather list. All the parameters | |
|
339 | must be the same as those passed in to the scatter/gather mapping | |
|
340 | API. | |
|
341 | ||
|
342 | Note: <nhwentries> must be the number you passed in, *not* the number of | |
|
343 | bus address entries returned. | |
|
344 | ||
|
345 | void | |
|
346 | dma_sync_single_for_cpu(struct device *dev, dma_addr_t dma_handle, size_t size, | |
|
347 | enum dma_data_direction direction) | |
|
348 | void | |
|
349 | dma_sync_single_for_device(struct device *dev, dma_addr_t dma_handle, size_t size, | |
|
350 | enum dma_data_direction direction) | |
|
351 | void | |
|
352 | dma_sync_sg_for_cpu(struct device *dev, struct scatterlist *sg, int nelems, | |
|
353 | enum dma_data_direction direction) | |
|
354 | void | |
|
355 | dma_sync_sg_for_device(struct device *dev, struct scatterlist *sg, int nelems, | |
|
356 | enum dma_data_direction direction) | |
|
357 | ||
|
358 | Synchronise a single contiguous or scatter/gather mapping for the CPU | |
|
359 | and device. With the sync_sg API, all the parameters must be the same | |
|
360 | as those passed into the single mapping API. With the sync_single API, | |
|
361 | you can use dma_handle and size parameters that aren't identical to | |
|
362 | those passed into the single mapping API to do a partial sync. | |
|
363 | ||
|
364 | Notes: You must do this: | |
|
365 | ||
|
366 | - Before reading values that have been written by DMA from the device | |
|
367 | (use the DMA_FROM_DEVICE direction) | |
|
368 | - After writing values that will be written to the device using DMA | |
|
369 | (use the DMA_TO_DEVICE direction) | |
|
370 | - before *and* after handing memory to the device if the memory is | |
|
371 | DMA_BIDIRECTIONAL | |
|
372 | ||
|
373 | See also dma_map_single(). | |
|
374 | ||
|
375 | dma_addr_t | |
|
376 | dma_map_single_attrs(struct device *dev, void *cpu_addr, size_t size, | |
|
377 | enum dma_data_direction dir, | |
|
378 | struct dma_attrs *attrs) | |
|
379 | ||
|
380 | void | |
|
381 | dma_unmap_single_attrs(struct device *dev, dma_addr_t dma_addr, | |
|
382 | size_t size, enum dma_data_direction dir, | |
|
383 | struct dma_attrs *attrs) | |
|
384 | ||
|
385 | int | |
|
386 | dma_map_sg_attrs(struct device *dev, struct scatterlist *sgl, | |
|
387 | int nents, enum dma_data_direction dir, | |
|
388 | struct dma_attrs *attrs) | |
|
389 | ||
|
390 | void | |
|
391 | dma_unmap_sg_attrs(struct device *dev, struct scatterlist *sgl, | |
|
392 | int nents, enum dma_data_direction dir, | |
|
393 | struct dma_attrs *attrs) | |
|
394 | ||
|
395 | The four functions above are just like the counterpart functions | |
|
396 | without the _attrs suffixes, except that they pass an optional | |
|
397 | struct dma_attrs*. | |
|
398 | ||
|
399 | struct dma_attrs encapsulates a set of "DMA attributes". For the | |
|
400 | definition of struct dma_attrs see linux/dma-attrs.h. | |
|
401 | ||
|
402 | The interpretation of DMA attributes is architecture-specific, and | |
|
403 | each attribute should be documented in Documentation/DMA-attributes.txt. | |
|
404 | ||
|
405 | If struct dma_attrs* is NULL, the semantics of each of these | |
|
406 | functions is identical to those of the corresponding function | |
|
407 | without the _attrs suffix. As a result dma_map_single_attrs() | |
|
408 | can generally replace dma_map_single(), etc. | |
|
409 | ||
|
410 | As an example of the use of the *_attrs functions, here's how | |
|
411 | you could pass an attribute DMA_ATTR_FOO when mapping memory | |
|
412 | for DMA: | |
|
413 | ||
|
414 | #include <linux/dma-attrs.h> | |
|
415 | /* DMA_ATTR_FOO should be defined in linux/dma-attrs.h and | |
|
416 | * documented in Documentation/DMA-attributes.txt */ | |
|
417 | ... | |
|
418 | ||
|
419 | DEFINE_DMA_ATTRS(attrs); | |
|
420 | dma_set_attr(DMA_ATTR_FOO, &attrs); | |
|
421 | .... | |
|
422 | n = dma_map_sg_attrs(dev, sg, nents, DMA_TO_DEVICE, &attrs); | |
|
423 | .... | |
|
424 | ||
|
425 | Architectures that care about DMA_ATTR_FOO would check for its | |
|
426 | presence in their implementations of the mapping and unmapping | |
|
427 | routines, e.g.: | |
|
428 | ||
|
429 | void whizco_dma_map_sg_attrs(struct device *dev, dma_addr_t dma_addr, | |
|
430 | size_t size, enum dma_data_direction dir, | |
|
431 | struct dma_attrs *attrs) | |
|
432 | { | |
|
433 | .... | |
|
434 | int foo = dma_get_attr(DMA_ATTR_FOO, attrs); | |
|
435 | .... | |
|
436 | if (foo) | |
|
437 | /* twizzle the frobnozzle */ | |
|
438 | .... | |
|
439 | ||
|
440 | ||
|
441 | Part II - Advanced dma_ usage | |
|
442 | ----------------------------- | |
|
443 | ||
|
444 | Warning: These pieces of the DMA API should not be used in the | |
|
445 | majority of cases, since they cater for unlikely corner cases that | |
|
446 | don't belong in usual drivers. | |
|
447 | ||
|
448 | If you don't understand how cache line coherency works between a | |
|
449 | processor and an I/O device, you should not be using this part of the | |
|
450 | API at all. | |
|
451 | ||
|
452 | void * | |
|
453 | dma_alloc_noncoherent(struct device *dev, size_t size, | |
|
454 | dma_addr_t *dma_handle, gfp_t flag) | |
|
455 | ||
|
456 | Identical to dma_alloc_coherent() except that the platform will | |
|
457 | choose to return either consistent or non-consistent memory as it sees | |
|
458 | fit. By using this API, you are guaranteeing to the platform that you | |
|
459 | have all the correct and necessary sync points for this memory in the | |
|
460 | driver should it choose to return non-consistent memory. | |
|
461 | ||
|
462 | Note: where the platform can return consistent memory, it will | |
|
463 | guarantee that the sync points become nops. | |
|
464 | ||
|
465 | Warning: Handling non-consistent memory is a real pain. You should | |
|
466 | only use this API if you positively know your driver will be | |
|
467 | required to work on one of the rare (usually non-PCI) architectures | |
|
468 | that simply cannot make consistent memory. | |
|
469 | ||
|
470 | void | |
|
471 | dma_free_noncoherent(struct device *dev, size_t size, void *cpu_addr, | |
|
472 | dma_addr_t dma_handle) | |
|
473 | ||
|
474 | Free memory allocated by the nonconsistent API. All parameters must | |
|
475 | be identical to those passed in (and returned by | |
|
476 | dma_alloc_noncoherent()). | |
|
477 | ||
|
478 | int | |
|
479 | dma_get_cache_alignment(void) | |
|
480 | ||
|
481 | Returns the processor cache alignment. This is the absolute minimum | |
|
482 | alignment *and* width that you must observe when either mapping | |
|
483 | memory or doing partial flushes. | |
|
484 | ||
|
485 | Notes: This API may return a number *larger* than the actual cache | |
|
486 | line, but it will guarantee that one or more cache lines fit exactly | |
|
487 | into the width returned by this call. It will also always be a power | |
|
488 | of two for easy alignment. | |
|
489 | ||
|
490 | void | |
|
491 | dma_cache_sync(struct device *dev, void *vaddr, size_t size, | |
|
492 | enum dma_data_direction direction) | |
|
493 | ||
|
494 | Do a partial sync of memory that was allocated by | |
|
495 | dma_alloc_noncoherent(), starting at virtual address vaddr and | |
|
496 | continuing on for size. Again, you *must* observe the cache line | |
|
497 | boundaries when doing this. | |
|
498 | ||
|
499 | int | |
|
500 | dma_declare_coherent_memory(struct device *dev, phys_addr_t phys_addr, | |
|
501 | dma_addr_t device_addr, size_t size, int | |
|
502 | flags) | |
|
503 | ||
|
504 | Declare region of memory to be handed out by dma_alloc_coherent() when | |
|
505 | it's asked for coherent memory for this device. | |
|
506 | ||
|
507 | phys_addr is the CPU physical address to which the memory is currently | |
|
508 | assigned (this will be ioremapped so the CPU can access the region). | |
|
509 | ||
|
510 | device_addr is the bus address the device needs to be programmed | |
|
511 | with to actually address this memory (this will be handed out as the | |
|
512 | dma_addr_t in dma_alloc_coherent()). | |
|
513 | ||
|
514 | size is the size of the area (must be multiples of PAGE_SIZE). | |
|
515 | ||
|
516 | flags can be ORed together and are: | |
|
517 | ||
|
518 | DMA_MEMORY_MAP - request that the memory returned from | |
|
519 | dma_alloc_coherent() be directly writable. | |
|
520 | ||
|
521 | DMA_MEMORY_IO - request that the memory returned from | |
|
522 | dma_alloc_coherent() be addressable using read()/write()/memcpy_toio() etc. | |
|
523 | ||
|
524 | One or both of these flags must be present. | |
|
525 | ||
|
526 | DMA_MEMORY_INCLUDES_CHILDREN - make the declared memory be allocated by | |
|
527 | dma_alloc_coherent of any child devices of this one (for memory residing | |
|
528 | on a bridge). | |
|
529 | ||
|
530 | DMA_MEMORY_EXCLUSIVE - only allocate memory from the declared regions. | |
|
531 | Do not allow dma_alloc_coherent() to fall back to system memory when | |
|
532 | it's out of memory in the declared region. | |
|
533 | ||
|
534 | The return value will be either DMA_MEMORY_MAP or DMA_MEMORY_IO and | |
|
535 | must correspond to a passed in flag (i.e. no returning DMA_MEMORY_IO | |
|
536 | if only DMA_MEMORY_MAP were passed in) for success or zero for | |
|
537 | failure. | |
|
538 | ||
|
539 | Note, for DMA_MEMORY_IO returns, all subsequent memory returned by | |
|
540 | dma_alloc_coherent() may no longer be accessed directly, but instead | |
|
541 | must be accessed using the correct bus functions. If your driver | |
|
542 | isn't prepared to handle this contingency, it should not specify | |
|
543 | DMA_MEMORY_IO in the input flags. | |
|
544 | ||
|
545 | As a simplification for the platforms, only *one* such region of | |
|
546 | memory may be declared per device. | |
|
547 | ||
|
548 | For reasons of efficiency, most platforms choose to track the declared | |
|
549 | region only at the granularity of a page. For smaller allocations, | |
|
550 | you should use the dma_pool() API. | |
|
551 | ||
|
552 | void | |
|
553 | dma_release_declared_memory(struct device *dev) | |
|
554 | ||
|
555 | Remove the memory region previously declared from the system. This | |
|
556 | API performs *no* in-use checking for this region and will return | |
|
557 | unconditionally having removed all the required structures. It is the | |
|
558 | driver's job to ensure that no parts of this memory region are | |
|
559 | currently in use. | |
|
560 | ||
|
561 | void * | |
|
562 | dma_mark_declared_memory_occupied(struct device *dev, | |
|
563 | dma_addr_t device_addr, size_t size) | |
|
564 | ||
|
565 | This is used to occupy specific regions of the declared space | |
|
566 | (dma_alloc_coherent() will hand out the first free region it finds). | |
|
567 | ||
|
568 | device_addr is the *device* address of the region requested. | |
|
569 | ||
|
570 | size is the size (and should be a page-sized multiple). | |
|
571 | ||
|
572 | The return value will be either a pointer to the processor virtual | |
|
573 | address of the memory, or an error (via PTR_ERR()) if any part of the | |
|
574 | region is occupied. | |
|
575 | ||
|
576 | Part III - Debug drivers use of the DMA-API | |
|
577 | ------------------------------------------- | |
|
578 | ||
|
579 | The DMA-API as described above has some constraints. DMA addresses must be | |
|
580 | released with the corresponding function with the same size for example. With | |
|
581 | the advent of hardware IOMMUs it becomes more and more important that drivers | |
|
582 | do not violate those constraints. In the worst case such a violation can | |
|
583 | result in data corruption up to destroyed filesystems. | |
|
584 | ||
|
585 | To debug drivers and find bugs in the usage of the DMA-API checking code can | |
|
586 | be compiled into the kernel which will tell the developer about those | |
|
587 | violations. If your architecture supports it you can select the "Enable | |
|
588 | debugging of DMA-API usage" option in your kernel configuration. Enabling this | |
|
589 | option has a performance impact. Do not enable it in production kernels. | |
|
590 | ||
|
591 | If you boot the resulting kernel, it will contain code which does some bookkeeping | |
|
592 | about what DMA memory was allocated for which device. If this code detects an | |
|
593 | error it prints a warning message with some details into your kernel log. An | |
|
594 | example warning message may look like this: | |
|
595 | ||
|
596 | ------------[ cut here ]------------ | |
|
597 | WARNING: at /data2/repos/linux-2.6-iommu/lib/dma-debug.c:448 | |
|
598 | check_unmap+0x203/0x490() | |
|
599 | Hardware name: | |
|
600 | forcedeth 0000:00:08.0: DMA-API: device driver frees DMA memory with wrong | |
|
601 | function [device address=0x00000000640444be] [size=66 bytes] [mapped as | |
|
602 | single] [unmapped as page] | |
|
603 | Modules linked in: nfsd exportfs bridge stp llc r8169 | |
|
604 | Pid: 0, comm: swapper Tainted: G W 2.6.28-dmatest-09289-g8bb99c0 #1 | |
|
605 | Call Trace: | |
|
606 | <IRQ> [<ffffffff80240b22>] warn_slowpath+0xf2/0x130 | |
|
607 | [<ffffffff80647b70>] _spin_unlock+0x10/0x30 | |
|
608 | [<ffffffff80537e75>] usb_hcd_link_urb_to_ep+0x75/0xc0 | |
|
609 | [<ffffffff80647c22>] _spin_unlock_irqrestore+0x12/0x40 | |
|
610 | [<ffffffff8055347f>] ohci_urb_enqueue+0x19f/0x7c0 | |
|
611 | [<ffffffff80252f96>] queue_work+0x56/0x60 | |
|
612 | [<ffffffff80237e10>] enqueue_task_fair+0x20/0x50 | |
|
613 | [<ffffffff80539279>] usb_hcd_submit_urb+0x379/0xbc0 | |
|
614 | [<ffffffff803b78c3>] cpumask_next_and+0x23/0x40 | |
|
615 | [<ffffffff80235177>] find_busiest_group+0x207/0x8a0 | |
|
616 | [<ffffffff8064784f>] _spin_lock_irqsave+0x1f/0x50 | |
|
617 | [<ffffffff803c7ea3>] check_unmap+0x203/0x490 | |
|
618 | [<ffffffff803c8259>] debug_dma_unmap_page+0x49/0x50 | |
|
619 | [<ffffffff80485f26>] nv_tx_done_optimized+0xc6/0x2c0 | |
|
620 | [<ffffffff80486c13>] nv_nic_irq_optimized+0x73/0x2b0 | |
|
621 | [<ffffffff8026df84>] handle_IRQ_event+0x34/0x70 | |
|
622 | [<ffffffff8026ffe9>] handle_edge_irq+0xc9/0x150 | |
|
623 | [<ffffffff8020e3ab>] do_IRQ+0xcb/0x1c0 | |
|
624 | [<ffffffff8020c093>] ret_from_intr+0x0/0xa | |
|
625 | <EOI> <4>---[ end trace f6435a98e2a38c0e ]--- | |
|
626 | ||
|
627 | The driver developer can find the driver and the device including a stacktrace | |
|
628 | of the DMA-API call which caused this warning. | |
|
629 | ||
|
630 | Per default only the first error will result in a warning message. All other | |
|
631 | errors will only be silently counted. This limitation exists to prevent the code | |
|
632 | from flooding your kernel log. To support debugging a device driver this can | |
|
633 | be disabled via debugfs. See the debugfs interface documentation below for | |
|
634 | details. | |
|
635 | ||
|
636 | The debugfs directory for the DMA-API debugging code is called dma-api/. In | |
|
637 | this directory the following files can currently be found: | |
|
638 | ||
|
639 | dma-api/all_errors This file contains a numeric value. If this | |
|
640 | value is not equal to zero the debugging code | |
|
641 | will print a warning for every error it finds | |
|
642 | into the kernel log. Be careful with this | |
|
643 | option, as it can easily flood your logs. | |
|
644 | ||
|
645 | dma-api/disabled This read-only file contains the character 'Y' | |
|
646 | if the debugging code is disabled. This can | |
|
647 | happen when it runs out of memory or if it was | |
|
648 | disabled at boot time | |
|
649 | ||
|
650 | dma-api/error_count This file is read-only and shows the total | |
|
651 | numbers of errors found. | |
|
652 | ||
|
653 | dma-api/num_errors The number in this file shows how many | |
|
654 | warnings will be printed to the kernel log | |
|
655 | before it stops. This number is initialized to | |
|
656 | one at system boot and can be set by writing into | |
|
657 | this file | |
|
658 | ||
|
659 | dma-api/min_free_entries | |
|
660 | This read-only file can be read to get the | |
|
661 | minimum number of free dma_debug_entries the | |
|
662 | allocator has ever seen. If this value goes | |
|
663 | down to zero the code will disable itself | |
|
664 | because it is no longer reliable. | |
|
665 | ||
|
666 | dma-api/num_free_entries | |
|
667 | The current number of free dma_debug_entries | |
|
668 | in the allocator. | |
|
669 | ||
|
670 | dma-api/driver-filter | |
|
671 | You can write a name of a driver into this file | |
|
672 | to limit the debug output to requests from that | |
|
673 | particular driver. Write an empty string to | |
|
674 | that file to disable the filter and see | |
|
675 | all errors again. | |
|
676 | ||
|
677 | If you have this code compiled into your kernel it will be enabled by default. | |
|
678 | If you want to boot without the bookkeeping anyway you can provide | |
|
679 | 'dma_debug=off' as a boot parameter. This will disable DMA-API debugging. | |
|
680 | Notice that you can not enable it again at runtime. You have to reboot to do | |
|
681 | so. | |
|
682 | ||
|
683 | If you want to see debug messages only for a special device driver you can | |
|
684 | specify the dma_debug_driver=<drivername> parameter. This will enable the | |
|
685 | driver filter at boot time. The debug code will only print errors for that | |
|
686 | driver afterwards. This filter can be disabled or changed later using debugfs. | |
|
687 | ||
|
688 | When the code disables itself at runtime this is most likely because it ran | |
|
689 | out of dma_debug_entries. These entries are preallocated at boot. The number | |
|
690 | of preallocated entries is defined per architecture. If it is too low for you | |
|
691 | boot with 'dma_debug_entries=<your_desired_number>' to overwrite the | |
|
692 | architectural default. | |
|
693 | ||
|
694 | void debug_dma_mapping_error(struct device *dev, dma_addr_t dma_addr); | |
|
695 | ||
|
696 | dma-debug interface debug_dma_mapping_error() to debug drivers that fail | |
|
697 | to check DMA mapping errors on addresses returned by dma_map_single() and | |
|
698 | dma_map_page() interfaces. This interface clears a flag set by | |
|
699 | debug_dma_map_page() to indicate that dma_mapping_error() has been called by | |
|
700 | the driver. When the driver does unmap, debug_dma_unmap() checks the flag and if | |
|
701 | this flag is still set, prints a warning message that includes the call trace that | |
|
702 | leads up to the unmap. This interface can be called from dma_mapping_error() | |
|
703 | routines to enable DMA mapping error check debugging. | |
|
704 |
@@ -0,0 +1,24 | |||
|
1 | This is free and unencumbered software released into the public domain. | |
|
2 | ||
|
3 | Anyone is free to copy, modify, publish, use, compile, sell, or | |
|
4 | distribute this software, either in source code form or as a compiled | |
|
5 | binary, for any purpose, commercial or non-commercial, and by any | |
|
6 | means. | |
|
7 | ||
|
8 | In jurisdictions that recognize copyright laws, the author or authors | |
|
9 | of this software dedicate any and all copyright interest in the | |
|
10 | software to the public domain. We make this dedication for the benefit | |
|
11 | of the public at large and to the detriment of our heirs and | |
|
12 | successors. We intend this dedication to be an overt act of | |
|
13 | relinquishment in perpetuity of all present and future rights to this | |
|
14 | software under copyright law. | |
|
15 | ||
|
16 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, | |
|
17 | EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF | |
|
18 | MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. | |
|
19 | IN NO EVENT SHALL THE AUTHORS BE LIABLE FOR ANY CLAIM, DAMAGES OR | |
|
20 | OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, | |
|
21 | ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR | |
|
22 | OTHER DEALINGS IN THE SOFTWARE. | |
|
23 | ||
|
24 | For more information, please refer to <http://unlicense.org> |
@@ -0,0 +1,14 | |||
|
1 | CC := gcc | |
|
2 | CFLAGS := -g | |
|
3 | ||
|
4 | all: dma-example dma-gpio | |
|
5 | ||
|
6 | dma-example: dma-example.c | |
|
7 | $(CC) $(CFLAGS) -o dma-example dma-example.c | |
|
8 | dma-gpio: dma-gpio.c | |
|
9 | $(CC) $(CFLAGS) -O2 -std=gnu99 -o dma-gpio dma-gpio.c -lrt | |
|
10 | ||
|
11 | clean: | |
|
12 | rm -rf dma-example dma-gpio | |
|
13 | ||
|
14 | .PHONY: clean |
@@ -0,0 +1,22 | |||
|
1 | Raspberry-Pi-DMA-Example | |
|
2 | ======================== | |
|
3 | ||
|
4 | Simplest example of copying memory from one region to another using DMA ("Direct Memory Access") in userland | |
|
5 | ||
|
6 | Just type `make`, and then `sudo ./dma-example` (must use sudo to get permissions for writing to DMA peripheral) | |
|
7 | ||
|
8 | The example simply copies the string "hello world" from one place in memory to another through the use of the Raspberry Pi's DMA peripheral. | |
|
9 | ||
|
10 | Run `sudo ./dma-gpio` for an example which toggles a GPIO output pin at 500Hz using DMA. This code (dma-gpio.c) creates a 8ms circular buffer of future output states for all 64 IOs and uses DMA to sequentially copy this buffer into the memory-mapped GPIO registers at a rate of 250,000 frames per second. This allows one to output precise waveforms to any GPIO pin without worrying about Linux task scheduling. The PWM peripheral is used for pacing the DMA transaction, so simultaneous audio output will likely cause errors. Heavy network or USB usage will decrease the timing accuracy for frame rates of 500,000+ fps, due to bus-contention, but even downloading a file at 1MB/sec only has a *very* small impact at 250,000 fps. | |
|
11 | ||
|
12 | Some code, namely for translating virtual addresses to physical ones within dma-example.c, was based on that found in the Raspberry Pi FM Transmitter which I *think* is by either Oliver Mattos or Oskar Weigl, but their website has been down for a while now. Some of the code can still be found here: http://www.raspians.com/turning-the-raspberry-pi-into-an-fm-transmitter/ | |
|
13 | ||
|
14 | Problems | |
|
15 | ====== | |
|
16 | ||
|
17 | The virtual->physical mapping function in `dma-example.c` is not cache-coherent. That means that the dma engine might see different data than the cpu. The equivalent functions in `dma-gpio.c` behave correctly, so it is only a matter of porting that code once I have time. | |
|
18 | ||
|
19 | License | |
|
20 | ====== | |
|
21 | ||
|
22 | I'm putting this code in the public domain. However, the two functions in dma-example.c - `makeVirtPhysPage` and `freeVirtPhysPage` - were based on code found in the FM Transmitter, which was GPL-licensed. If you want to use this code under a non-GPL license, I would recommend replacing those functions with your own code, just to be extra safe. **Disclaimer**: I am not a lawyer. |
@@ -0,0 +1,261 | |||
|
1 | /* | |
|
2 | * https://github.com/Wallacoloo/Raspberry-Pi-DMA-Example : DMA Raspberry Pi Examples | |
|
3 | * Author: Colin Wallace | |
|
4 | ||
|
5 | This is free and unencumbered software released into the public domain. | |
|
6 | ||
|
7 | Anyone is free to copy, modify, publish, use, compile, sell, or | |
|
8 | distribute this software, either in source code form or as a compiled | |
|
9 | binary, for any purpose, commercial or non-commercial, and by any | |
|
10 | means. | |
|
11 | ||
|
12 | In jurisdictions that recognize copyright laws, the author or authors | |
|
13 | of this software dedicate any and all copyright interest in the | |
|
14 | software to the public domain. We make this dedication for the benefit | |
|
15 | of the public at large and to the detriment of our heirs and | |
|
16 | successors. We intend this dedication to be an overt act of | |
|
17 | relinquishment in perpetuity of all present and future rights to this | |
|
18 | software under copyright law. | |
|
19 | ||
|
20 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, | |
|
21 | EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF | |
|
22 | MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. | |
|
23 | IN NO EVENT SHALL THE AUTHORS BE LIABLE FOR ANY CLAIM, DAMAGES OR | |
|
24 | OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, | |
|
25 | ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR | |
|
26 | OTHER DEALINGS IN THE SOFTWARE. | |
|
27 | ||
|
28 | For more information, please refer to <http://unlicense.org/> | |
|
29 | */ | |
|
30 | ||
|
31 | /* | |
|
32 | * processor documentation is at: http://www.raspberrypi.org/wp-content/uploads/2012/02/BCM2835-ARM-Peripherals.pdf | |
|
33 | * pg 38 for DMA | |
|
34 | */ | |
|
35 | ||
|
36 | #include <sys/mman.h> //for mmap | |
|
37 | #include <unistd.h> //for NULL | |
|
38 | #include <stdio.h> //for printf | |
|
39 | #include <stdlib.h> //for exit | |
|
40 | #include <fcntl.h> //for file opening | |
|
41 | #include <stdint.h> //for uint32_t | |
|
42 | #include <string.h> //for memset | |
|
43 | ||
|
44 | #define PAGE_SIZE 4096 //mmap maps pages of memory, so we must give it multiples of this size | |
|
45 | ||
|
46 | //physical addresses for the DMA peripherals, as found in the processor documentation: | |
|
47 | #define DMA_BASE 0x20007000 | |
|
48 | //DMA Channel register sets (format of these registers is found in DmaChannelHeader struct): | |
|
49 | #define DMACH0 0x20007000 | |
|
50 | #define DMACH1 0x20007100 | |
|
51 | #define DMACH2 0x20007200 | |
|
52 | #define DMACH3 0x20007300 | |
|
53 | //... | |
|
54 | #define DMACH(n) (DMACH0 + (n)*0x100) | |
|
55 | //Each DMA channel has some associated registers, but only CS (control and status), CONBLK_AD (control block address), and DEBUG are writeable | |
|
56 | //DMA is started by writing address of the first Control Block to the DMA channel's CONBLK_AD register and then setting the ACTIVE bit inside the CS register (bit 0) | |
|
57 | //Note: DMA channels are connected directly to peripherals, so physical addresses should be used (affects control block's SOURCE, DEST and NEXTCONBK addresses). | |
|
58 | #define DMAENABLE 0x20007ff0 //bit 0 should be set to 1 to enable channel 0. bit 1 enables channel 1, etc. | |
|
59 | ||
|
60 | //flags used in the DmaChannelHeader struct: | |
|
61 | #define DMA_CS_RESET (1<<31) | |
|
62 | #define DMA_CS_ACTIVE (1<<0) | |
|
63 | ||
|
64 | #define DMA_DEBUG_READ_ERROR (1<<2) | |
|
65 | #define DMA_DEBUG_FIFO_ERROR (1<<1) | |
|
66 | #define DMA_DEBUG_READ_LAST_NOT_SET_ERROR (1<<0) | |
|
67 | ||
|
68 | //flags used in the DmaControlBlock struct: | |
|
69 | #define DMA_CB_TI_DEST_INC (1<<4) | |
|
70 | #define DMA_CB_TI_SRC_INC (1<<8) | |
|
71 | ||
|
72 | //set bits designated by (mask) at the address (dest) to (value), without affecting the other bits | |
|
73 | //eg if x = 0b11001100 | |
|
74 | // writeBitmasked(&x, 0b00000110, 0b11110011), | |
|
75 | // then x now = 0b11001110 | |
|
76 | void writeBitmasked(volatile uint32_t *dest, uint32_t mask, uint32_t value) { | |
|
77 | uint32_t cur = *dest; | |
|
78 | uint32_t new = (cur & (~mask)) | (value & mask); | |
|
79 | *dest = new; | |
|
80 | *dest = new; //added safety for when crossing memory barriers. | |
|
81 | } | |
|
82 | ||
|
83 | struct DmaChannelHeader { | |
|
84 | uint32_t CS; //Control and Status | |
|
85 | //31 RESET; set to 1 to reset DMA | |
|
86 | //30 ABORT; set to 1 to abort current DMA control block (next one will be loaded & continue) | |
|
87 | //29 DISDEBUG; set to 1 and DMA won't be paused when debug signal is sent | |
|
88 | //28 WAIT_FOR_OUTSTANDING_WRITES; set to 1 and DMA will wait until peripheral says all writes have gone through before loading next CB | |
|
89 | //24-27 reserved | |
|
90 | //20-23 PANIC_PRIORITY; 0 is lowest priority | |
|
91 | //16-19 PRIORITY; bus scheduling priority. 0 is lowest | |
|
92 | //9-15 reserved | |
|
93 | //8 ERROR; read as 1 when error is encountered. error can be found in DEBUG register. | |
|
94 | //7 reserved | |
|
95 | //6 WAITING_FOR_OUTSTANDING_WRITES; read as 1 when waiting for outstanding writes | |
|
96 | //5 DREQ_STOPS_DMA; read as 1 if DREQ is currently preventing DMA | |
|
97 | //4 PAUSED; read as 1 if DMA is paused | |
|
98 | //3 DREQ; copy of the data request signal from the peripheral, if DREQ is enabled. reads as 1 if data is being requested, else 0 | |
|
99 | //2 INT; set when current CB ends and its INTEN=1. Write a 1 to this register to clear it | |
|
100 | //1 END; set when the transfer defined by current CB is complete. Write 1 to clear. | |
|
101 | //0 ACTIVE; write 1 to activate DMA (load the CB before hand) | |
|
102 | uint32_t CONBLK_AD; //Control Block Address | |
|
103 | uint32_t TI; //transfer information; see DmaControlBlock.TI for description | |
|
104 | uint32_t SOURCE_AD; //Source address | |
|
105 | uint32_t DEST_AD; //Destination address | |
|
106 | uint32_t TXFR_LEN; //transfer length. | |
|
107 | uint32_t STRIDE; //2D Mode Stride. Only used if TI.TDMODE = 1 | |
|
108 | uint32_t NEXTCONBK; //Next control block. Must be 256-bit aligned (32 bytes; 8 words) | |
|
109 | uint32_t DEBUG; //controls debug settings | |
|
110 | }; | |
|
111 | ||
|
112 | struct DmaControlBlock { | |
|
113 | uint32_t TI; //transfer information | |
|
114 | //31:27 unused | |
|
115 | //26 NO_WIDE_BURSTS | |
|
116 | //21:25 WAITS; number of cycles to wait between each DMA read/write operation | |
|
117 | //16:20 PERMAP; peripheral number to be used for DREQ signal (pacing). set to 0 for unpaced DMA. | |
|
118 | //12:15 BURST_LENGTH | |
|
119 | //11 SRC_IGNORE; set to 1 to not perform reads. Used to manually fill caches | |
|
120 | //10 SRC_DREQ; set to 1 to have the DREQ from PERMAP gate requests. | |
|
121 | //9 SRC_WIDTH; set to 1 for 128-bit moves, 0 for 32-bit moves | |
|
122 | //8 SRC_INC; set to 1 to automatically increment the source address after each read (you'll want this if you're copying a range of memory) | |
|
123 | //7 DEST_IGNORE; set to 1 to not perform writes. | |
|
124 | //6 DEST_DREQ; set to 1 to have the DREQ from PERMAP gate *writes* | |
|
125 | //5 DEST_WIDTH; set to 1 for 128-bit moves, 0 for 32-bit moves | |
|
126 | //4 DEST_INC; set to 1 to automatically increment the destination address after each write (you'll want this if you're copying a range of memory) | |
|
127 | //3 WAIT_RESP; make DMA wait for a response from the peripheral during each write. Ensures multiple writes don't get stacked in the pipeline | |
|
128 | //2 unused (0) | |
|
129 | //1 TDMODE; set to 1 to enable 2D mode | |
|
130 | //0 INTEN; set to 1 to generate an interrupt upon completion | |
|
131 | uint32_t SOURCE_AD; //Source address | |
|
132 | uint32_t DEST_AD; //Destination address | |
|
133 | uint32_t TXFR_LEN; //transfer length. | |
|
134 | uint32_t STRIDE; //2D Mode Stride. Only used if TI.TDMODE = 1 | |
|
135 | uint32_t NEXTCONBK; //Next control block. Must be 256-bit aligned (32 bytes; 8 words) | |
|
136 | uint32_t _reserved[2]; | |
|
137 | }; | |
|
138 | ||
|
139 | //allocate a page & simultaneously determine its physical address. | |
|
140 | //virtAddr and physAddr are essentially passed by-reference. | |
|
141 | //this allows for: | |
|
142 | //void *virt, *phys; | |
|
143 | //makeVirtPhysPage(&virt, &phys) | |
|
144 | //now, virt[N] exists for 0 <= N < PAGE_SIZE, | |
|
145 | // and phys+N is the physical address for virt[N] | |
|
146 | //based on http://www.raspians.com/turning-the-raspberry-pi-into-an-fm-transmitter/ | |
|
147 | void makeVirtPhysPage(void** virtAddr, void** physAddr) { | |
|
148 | *virtAddr = valloc(PAGE_SIZE); //allocate one page of RAM | |
|
149 | ||
|
150 | //force page into RAM and then lock it there: | |
|
151 | ((int*)*virtAddr)[0] = 1; | |
|
152 | mlock(*virtAddr, PAGE_SIZE); | |
|
153 | memset(*virtAddr, 0, PAGE_SIZE); //zero-fill the page for convenience | |
|
154 | ||
|
155 | //Magic to determine the physical address for this page: | |
|
156 | uint64_t pageInfo; | |
|
157 | int file = open("/proc/self/pagemap", 'r'); | |
|
158 | lseek(file, ((uint32_t)*virtAddr)/PAGE_SIZE*8, SEEK_SET); | |
|
159 | read(file, &pageInfo, 8); | |
|
160 | ||
|
161 | *physAddr = (void*)(uint32_t)(pageInfo*PAGE_SIZE); | |
|
162 | printf("makeVirtPhysPage virtual to phys: %p -> %p\n", *virtAddr, *physAddr); | |
|
163 | } | |
|
164 | ||
|
165 | //call with virtual address to deallocate a page allocated with makeVirtPhysPage | |
|
166 | void freeVirtPhysPage(void* virtAddr) { | |
|
167 | munlock(virtAddr, PAGE_SIZE); | |
|
168 | free(virtAddr); | |
|
169 | } | |
|
170 | ||
|
171 | //map a physical address into our virtual address space. memfd is the file descriptor for /dev/mem | |
|
172 | volatile uint32_t* mapPeripheral(int memfd, int addr) { | |
|
173 | ///dev/mem behaves as a file. We need to map that file into memory: | |
|
174 | void *mapped = mmap(NULL, PAGE_SIZE, PROT_READ|PROT_WRITE, MAP_SHARED, memfd, addr); | |
|
175 | //now, *mapped = memory at physical address of addr. | |
|
176 | if (mapped == MAP_FAILED) { | |
|
177 | printf("failed to map memory (did you remember to run as root?)\n"); | |
|
178 | exit(1); | |
|
179 | } else { | |
|
180 | printf("mapped: %p\n", mapped); | |
|
181 | } | |
|
182 | return (volatile uint32_t*)mapped; | |
|
183 | } | |
|
184 | ||
|
185 | int main() { | |
|
186 | //cat /sys/module/dma/parameters/dmachans gives a bitmask of DMA channels that are not used by GPU. Results: ch 1, 3, 6, 7 are reserved. | |
|
187 | //dmesg | grep "DMA"; results: Ch 2 is used by SDHC host | |
|
188 | //ch 0 is known to be used for graphics acceleration | |
|
189 | //Thus, applications can use ch 4, 5, or the LITE channels @ 8 and beyond. | |
|
190 | int dmaChNum = 5; | |
|
191 | //First, open the linux device, /dev/mem | |
|
192 | //dev/mem provides access to the physical memory of the entire processor+ram | |
|
193 | //This is needed because Linux uses virtual memory, thus the process's memory at 0x00000000 will NOT have the same contents as the physical memory at 0x00000000 | |
|
194 | int memfd = open("/dev/mem", O_RDWR | O_SYNC); | |
|
195 | if (memfd < 0) { | |
|
196 | printf("Failed to open /dev/mem (did you remember to run as root?)\n"); | |
|
197 | exit(1); | |
|
198 | } | |
|
199 | //now map /dev/mem into memory, but only map specific peripheral sections: | |
|
200 | volatile uint32_t *dmaBaseMem = mapPeripheral(memfd, DMA_BASE); | |
|
201 | ||
|
202 | //configure DMA: | |
|
203 | //allocate 1 page for the source and 1 page for the destination: | |
|
204 | void *virtSrcPage, *physSrcPage; | |
|
205 | makeVirtPhysPage(&virtSrcPage, &physSrcPage); | |
|
206 | void *virtDestPage, *physDestPage; | |
|
207 | makeVirtPhysPage(&virtDestPage, &physDestPage); | |
|
208 | ||
|
209 | //write a few bytes to the source page: | |
|
210 | char *srcArray = (char*)virtSrcPage; | |
|
211 | srcArray[0] = 'h'; | |
|
212 | srcArray[1] = 'e'; | |
|
213 | srcArray[2] = 'l'; | |
|
214 | srcArray[3] = 'l'; | |
|
215 | srcArray[4] = 'o'; | |
|
216 | srcArray[5] = ' '; | |
|
217 | srcArray[6] = 'w'; | |
|
218 | srcArray[7] = 'o'; | |
|
219 | srcArray[8] = 'r'; | |
|
220 | srcArray[9] = 'l'; | |
|
221 | srcArray[10] = 'd'; | |
|
222 | srcArray[11] = 0; //null terminator used for printf call. | |
|
223 | ||
|
224 | //allocate 1 page for the control blocks | |
|
225 | void *virtCbPage, *physCbPage; | |
|
226 | makeVirtPhysPage(&virtCbPage, &physCbPage); | |
|
227 | ||
|
228 | //dedicate the first 8 words of this page to holding the cb. | |
|
229 | struct DmaControlBlock *cb1 = (struct DmaControlBlock*)virtCbPage; | |
|
230 | ||
|
231 | //fill the control block: | |
|
232 | cb1->TI = DMA_CB_TI_SRC_INC | DMA_CB_TI_DEST_INC; //after each byte copied, we want to increment the source and destination address of the copy, otherwise we'll be copying to the same address. | |
|
233 | cb1->SOURCE_AD = (uint32_t)physSrcPage; //set source and destination DMA address | |
|
234 | cb1->DEST_AD = (uint32_t)physDestPage; | |
|
235 | cb1->TXFR_LEN = 12; //transfer 12 bytes | |
|
236 | cb1->STRIDE = 0; //no 2D stride | |
|
237 | cb1->NEXTCONBK = 0; //no next control block | |
|
238 | ||
|
239 | printf("destination was initially: '%s'\n", (char*)virtDestPage); | |
|
240 | ||
|
241 | //enable DMA channel (it's probably already enabled, but we want to be sure): | |
|
242 | writeBitmasked(dmaBaseMem + DMAENABLE - DMA_BASE, 1 << dmaChNum, 1 << dmaChNum); | |
|
243 | ||
|
244 | //configure the DMA header to point to our control block: | |
|
245 | volatile struct DmaChannelHeader *dmaHeader = (volatile struct DmaChannelHeader*)(dmaBaseMem + (DMACH(dmaChNum) - DMA_BASE)/4); //dmaBaseMem is a uint32_t ptr, so divide by 4 before adding byte offset | |
|
246 | dmaHeader->CS = DMA_CS_RESET; //make sure to disable dma first. | |
|
247 | sleep(1); //give time for the reset command to be handled. | |
|
248 | dmaHeader->DEBUG = DMA_DEBUG_READ_ERROR | DMA_DEBUG_FIFO_ERROR | DMA_DEBUG_READ_LAST_NOT_SET_ERROR; // clear debug error flags | |
|
249 | dmaHeader->CONBLK_AD = (uint32_t)physCbPage; //we have to point it to the PHYSICAL address of the control block (cb1) | |
|
250 | dmaHeader->CS = DMA_CS_ACTIVE; //set active bit, but everything else is 0. | |
|
251 | ||
|
252 | sleep(1); //give time for copy to happen | |
|
253 | ||
|
254 | printf("destination reads: '%s'\n", (char*)virtDestPage); | |
|
255 | ||
|
256 | //cleanup | |
|
257 | freeVirtPhysPage(virtCbPage); | |
|
258 | freeVirtPhysPage(virtDestPage); | |
|
259 | freeVirtPhysPage(virtSrcPage); | |
|
260 | return 0; | |
|
261 | } |
This diff has been collapsed as it changes many lines, (829 lines changed) Show them Hide them | |||
@@ -0,0 +1,829 | |||
|
1 | /* | |
|
2 | * https://github.com/Wallacoloo/Raspberry-Pi-DMA-Example : DMA Raspberry Pi Examples | |
|
3 | * Author: Colin Wallace | |
|
4 | ||
|
5 | This is free and unencumbered software released into the public domain. | |
|
6 | ||
|
7 | Anyone is free to copy, modify, publish, use, compile, sell, or | |
|
8 | distribute this software, either in source code form or as a compiled | |
|
9 | binary, for any purpose, commercial or non-commercial, and by any | |
|
10 | means. | |
|
11 | ||
|
12 | In jurisdictions that recognize copyright laws, the author or authors | |
|
13 | of this software dedicate any and all copyright interest in the | |
|
14 | software to the public domain. We make this dedication for the benefit | |
|
15 | of the public at large and to the detriment of our heirs and | |
|
16 | successors. We intend this dedication to be an overt act of | |
|
17 | relinquishment in perpetuity of all present and future rights to this | |
|
18 | software under copyright law. | |
|
19 | ||
|
20 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, | |
|
21 | EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF | |
|
22 | MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. | |
|
23 | IN NO EVENT SHALL THE AUTHORS BE LIABLE FOR ANY CLAIM, DAMAGES OR | |
|
24 | OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, | |
|
25 | ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR | |
|
26 | OTHER DEALINGS IN THE SOFTWARE. | |
|
27 | ||
|
28 | For more information, please refer to <http://unlicense.org/> | |
|
29 | */ | |
|
30 | /* | |
|
31 | * processor documentation is at: http://www.raspberrypi.org/wp-content/uploads/2012/02/BCM2835-ARM-Peripherals.pdf | |
|
32 | * pg 38 for DMA | |
|
33 | * pg 61 for DMA DREQ PERMAP | |
|
34 | * pg 89 for gpio | |
|
35 | * pg 119 for PCM | |
|
36 | * pg 138 for PWM | |
|
37 | * pg 172 for timer info | |
|
38 | * Addendum is http://www.scribd.com/doc/127599939/BCM2835-Audio-clocks | |
|
39 | * | |
|
40 | * A few annotations for GPIO/DMA/PWM are available here: https://github.com/626Pilot/RaspberryPi-NeoPixel-WS2812/blob/master/ws2812-RPi.c | |
|
41 | * https://github.com/metachris/raspberrypi-pwm/blob/master/rpio-pwm/rpio_pwm.c | |
|
42 | * https://github.com/richardghirst/PiBits/blob/master/ServoBlaster/user/servod.c | |
|
43 | * | |
|
44 | * Cache info can be found here: http://www.freelists.org/post/raspi-internals/caches,18 | |
|
45 | * 0x00000000 - L1 & L2 cache | |
|
46 | * 0x40000000 - L2 cache coherent (ie L1 writes are propagated to L2?) | |
|
47 | * 0x80000000 - L2 cache only | |
|
48 | * 0xc0000000 - direct uncached | |
|
49 | * | |
|
50 | * Useful DMA timings, etc: http://www.raspberrypi.org/forums/viewtopic.php?f=37&t=7696&start=50 | |
|
51 | * | |
|
52 | * The general idea is to have a buffer of N blocks, where each block is the same size as the gpio registers, | |
|
53 | * and have the DMA module continually copying the data in this buffer into those registers. | |
|
54 | * In this way, we can have (say) 32 blocks, and then be able to buffer the next 32 IO frames. | |
|
55 | * | |
|
56 | * How is DMA transfer rate controlled? | |
|
57 | * We can use the DREQ (data request) feature. | |
|
58 | * PWM supports a configurable data consumption clock (defaults to 100MHz) | |
|
59 | * PWM (and SPI, PCM) can fire a DREQ signal any time its fifo falls below a certain point. | |
|
60 | * But we are never filling the FIFO, so DREQ would be permanently high. | |
|
61 | * Could feed PWM with dummy data, and use 2 DMA channels (one to PWM, one to GPIO, both gated), but the write-time to GPIOs may vary from the PWM, so gating may be improper | |
|
62 | * Or we can use the WAITS portion of the CB header. This allows up to 31 cycle delay -> ~25MHz? | |
|
63 | * Will have to manually determine timing characteristics though. | |
|
64 | * Or use 2 dma channels: | |
|
65 | * Have one sending the data into PWM, which is DREQ limited | |
|
66 | * Have another copying from PWM Fifo to GPIOs at a non-limited rate. This is peripheral -> peripheral, so I think it will have its own data bus. | |
|
67 | * Unfortunately, the destination can only be one word. Luckily, we have 2 PWM channels - one for setting & one for clearing GPIOs. All gpios that are broken out into the header are in the first register (verified) | |
|
68 | * Sadly, it appears that the PWM FIFO cannot be read from. One can read the current PWM output, but only if the FIFO is disabled, in which case the DREQ is too. | |
|
69 | * | |
|
70 | **Or use 1 dma channel, but additionally write to a dreq-able peripheral (PWM): | |
|
71 | * By using control-blocks, one can copy a word to the GPIOs, then have the next CB copy a word to the PWM fifo, and repeat | |
|
72 | * By having BOTH control-blocks be dreq-limited by the PWM's dreq, they can BOTH be rate-limited. | |
|
73 | * PWM clock works as so: 500MHz / clock_div = PWM_BITRATE (note: bitrate!) | |
|
74 | * PWM_BITRATE / PWM_RNG1 = #of FIFO writes/sec | |
|
75 | * Max PWM_BITRATE = 25MHz | |
|
76 | * Also, dest_addr = 0x7e20b000 // the testbus interface which is a dump peripheral that goes nowhere (http://www.raspberrypi.org/forums/viewtopic.php?f=37&t=7696&start=25 ) | |
|
77 | * | |
|
78 | * DMA Control Block layout: | |
|
79 | * repeat #srcBlock times: | |
|
80 | * 1.copy srcBlock to gpios | |
|
81 | * 2.zero srcBlock | |
|
82 | * 3.move byte to PWM (paced via DREQ) | |
|
83 | * These are largely redundant; it may be possible to use less memory (each cb uses 32 bytes of memory) | |
|
84 | * | |
|
85 | * Problem: each "frame" is currently 6 words (but the last word is padding), and 1 PAGE_SIZE is not an integer multiple of 6*4 | |
|
86 | * Thus, the very last frame on each page cannot be used with DMA. Because of this, too, the virtual addressing of each frame is messed up - we must skip one frame per page. | |
|
87 | * One solution is to append 2 more pad words to each frame (so that it is 8 words in length). This fixes all issues, but increases ram usage and potentially cache problems (L2 is 128KB). However, since data reads are sequential, even if all data doesn't fit in cache, it will be prefetched. | |
|
88 | * Another solution is to decrease frame size to 4 words, and use 2 control blocks for each frame (thus eliminating the 1-byte padding in the center). This will have an even LARGER impact on ram usage - effectively using 20 words/frame vs current 14 words/frame & alternative 16words/frame | |
|
89 | * Another solution is to directly mix src data with CB data. Each CB has 2 words of padding, and a data frame is 5 words, and each CB must be aligned to 8 words. Therefore, the following is possible, assuming each frame requires 3 CBs: | |
|
90 | * CB1.1(padded) | CB1.2(padded) | CB1.3(padded) | CB2.1(padded) | CB2.2(padded) | CB2.3(unpadded) | SRC(5) | SRC(5) <- uses 56 words per 2 frames | |
|
91 | * HOWEVER, PAGE_SIZE is not an integral multiple of 56 words | |
|
92 | * Although, 1 of those CBs (the one which zeros the previous source) could be shared amongst multiple frames - that is, only zero every, say, 4 frames. The effect is: | |
|
93 | * *32 words for 1 frame grouped (5 src words - 2 means pad to 8 words for src) | |
|
94 | * 48 words for 2 frames grouped (10 src words - 2 means pad to 8 words for src) | |
|
95 | * 72 words for 3 frames grouped (15 src words - 2 means pad to 16 words for src) | |
|
96 | * 96 words for 4 frames grouped (20 src words - 2 means pad to 24 words for src) | |
|
97 | * 112 words for 5 frames grouped(25 src words - 2 means pad to 24 words for src) | |
|
98 | * 136 words for 6 frames grouped(30 src words - 2 means pad to 32 words for src) | |
|
99 | * 160 words for 7 frames grouped(35 src words - 2 means pad to 40 words for src) | |
|
100 | * 176 words for 8 frames grouped(40 src words - 2 means pad to 40 words for src) | |
|
101 | * 200 words for 9 frames grouped(45 src words - 2 means pad to 48 words for src) | |
|
102 | * 216 words for 10frames grouped(50 src words - 2 means pad to 48 words for src) | |
|
103 | * 240 words for 11frames grouped(55 src words - 2 means pad to 56 words for src) | |
|
104 | * 264 words for 12frames grouped(60 src words - 2 means pad to 64 words for src) | |
|
105 | * ...432 words for 20frames grouped(100src words - 2 means pad to 104 words for src) | |
|
106 | * ...*512 words for 24frames grouped(120src words - 2 means pad to 120 words for src) | |
|
107 | * As can be seen, this still requires extra padding. Could do 128 words for 5 frames, or 256 words for 11 frames (23.3 words/frame), and that requires funky math. | |
|
108 | * The 24 frame option would work OK. 24 is a relatively easy number to work with, and 21.3 words/frame (limit is 21 words/frame) | |
|
109 | * Another solution is to use the 2D stride functionality. The source frame is really 4 words and the destination is really 2 words, a 1 word gap, and then the other 2 words. Thus 2d stride can be used to skip over that one word gap. | |
|
110 | * | |
|
111 | * How to determine the current source word being processed? | |
|
112 | * dma header points to the physical CONBLOCK_AD. This can be linked to the virtual source address via a map. | |
|
113 | * OR: STRIDE register is unused in 1D mode. Could write the src index that this block is linked to in that register. But then we can't use stride feature. | |
|
114 | * Rather, we can't use the stride feature on ONE cb per frame. So, use stride on the buffer->GPIO cb, and use the stride register to indicate index on the zeros-copy and the PWM cb. Can tell which CB we're looking at based on the 2DEN flag. If we're looking at the buffer->GPIO cb, then instead look at NEXTCON_BK | |
|
115 | * NOTE: if 2d stride is disabled, it appears that the DMA engine doesn't even load the STRIDE register (it's read as garbage). It may PERHAPS display the last loaded word. | |
|
116 | * Note: unused fields are read as "Don't care", meaning we can't use them to store user-data. | |
|
117 | * | |
|
118 | * http://www.raspberrypi.org/forums/viewtopic.php?f=44&t=26907 | |
|
119 | * Says gpu halts all DMA for 16us every 500ms. To bypass, add 'disable_pvt=1' to /boot/cmdline.txt | |
|
120 | * http://www.raspberrypi.org/forums/viewtopic.php?f=37&t=7696&start=25 | |
|
121 | * Says it's possible to get access to a 250MHz clock. | |
|
122 | * How to make DMA more consistent (ie reduce bus contention?): | |
|
123 | * disable interrupts 1 uS before any 'real' transaction, enable them afterwards | |
|
124 | * Make sure dummy writes DON'T READ FROM RAM (ie, use src_ignore = 1) | |
|
125 | * boot with disable_pvt=1 (prevents gpu from halting everything to adjust ram refresh rate twice per second) in /boot/cmdline.txt. Does this affect system stability? | |
|
126 | */ | |
|
127 | ||
|
128 | #include <sys/mman.h> //for mmap | |
|
129 | #include <sys/time.h> //for timespec | |
|
130 | #include <time.h> //for timespec / nanosleep (need -std=gnu99) | |
|
131 | #include <signal.h> //for sigaction | |
|
132 | #include <unistd.h> //for NULL | |
|
133 | #include <stdio.h> //for printf | |
|
134 | #include <stdlib.h> //for exit, valloc | |
|
135 | //#include <malloc.h> //some implementations declare valloc inside malloc.h | |
|
136 | #include <fcntl.h> //for file opening | |
|
137 | #include <stdint.h> //for uint32_t | |
|
138 | #include <string.h> //for memset | |
|
139 | #include <errno.h> //for errno | |
|
140 | #include <pthread.h> //for pthread_setschedparam | |
|
141 | ||
|
142 | //config settings: | |
|
#define PWM_FIFO_SIZE 1 //The DMA transaction is paced through the PWM FIFO. The PWM FIFO consumes 1 word every N uS (set in clock settings). Once the fifo has fewer than PWM_FIFO_SIZE words available, it will request more data from DMA. Thus, a high buffer length will be more resistant to clock drift, but may occasionally request multiple frames in a short succession (faster than FRAME_PER_SEC) in the presence of bus contention, whereas a low buffer length will always space frames AT LEAST 1/FRAMES_PER_SEC seconds apart, but may experience clock drift.
#define SOURCE_BUFFER_FRAMES 8192 //number of gpio timeslices to buffer. These are processed at ~1 million/sec. So 1000 frames is 1 ms. Using a power-of-two is a good idea as it simplifies some of the arithmetic (modulus operations)
#define SCHED_PRIORITY 30 //Linux scheduler priority. Higher = more realtime

#define NOMINAL_CLOCK_FREQ 500000000 //PWM Clock runs at 500 MHz, unless overclocking
#define BITS_PER_CLOCK 10 //# of bits to be used in each PWM cycle. Effectively acts as a clock divisor for us, since the PWM clock is in bits/second
#define CLOCK_DIV 200 //# to divide the NOMINAL_CLOCK_FREQ by before passing it to the PWM peripheral.
//gpio frames per second is a product of the nominal clock frequency divided by BITS_PER_CLOCK and divided again by CLOCK_DIV
//At 500,000 frames/sec, memory bandwidth does not appear to be an issue (jitter of -1 to +2 uS)
//attempting 1,000,000 frames/sec results in an actual 800,000 frames/sec, though with a lot of jitter.
//Note that these numbers might vary with heavy network or usb usage.
//  eg at 500,000 fps, with 1MB/sec network download, jitter is -1 to +30 uS
//  at 250,000 fps, with 1MB/sec network download, jitter is only -3 to +3 uS
//NOTE: the expansion must be fully parenthesized; an unparenthesized
//  NOMINAL_CLOCK_FREQ/BITS_PER_CLOCK/CLOCK_DIV would make e.g.
//  x/FRAMES_PER_SEC expand to x/500000000/10/200, which is wrong.
#define FRAMES_PER_SEC (NOMINAL_CLOCK_FREQ/BITS_PER_CLOCK/CLOCK_DIV)
#define SEC_TO_FRAME(s) ((int64_t)(s)*FRAMES_PER_SEC)
#define USEC_TO_FRAME(u) (SEC_TO_FRAME(u)/1000000)
#define FRAME_TO_SEC(f) ((int64_t)(f)*BITS_PER_CLOCK*CLOCK_DIV/NOMINAL_CLOCK_FREQ)
#define FRAME_TO_USEC(f) FRAME_TO_SEC((int64_t)(f)*1000000)
|
161 | ||
|
162 | #define TIMER_BASE 0x20003000 | |
|
163 | #define TIMER_CLO 0x00000004 //lower 32-bits of 1 MHz timer | |
|
164 | #define TIMER_CHI 0x00000008 //upper 32-bits | |
|
165 | ||
|
166 | ||
|
167 | #define GPIO_BASE 0x20200000 //base address of the GPIO control registers. | |
|
168 | #define GPIO_BASE_BUS 0x7E200000 //this is the physical bus address of the GPIO module. This is only used when other peripherals directly connected to the bus (like DMA) need to read/write the GPIOs | |
|
169 | #define PAGE_SIZE 4096 //mmap maps pages of memory, so we must give it multiples of this size | |
|
170 | #define GPFSEL0 0x00000000 //gpio function select. There are 6 of these (32 bit registers) | |
|
171 | #define GPFSEL1 0x00000004 | |
|
172 | #define GPFSEL2 0x00000008 | |
|
173 | #define GPFSEL3 0x0000000c | |
|
174 | #define GPFSEL4 0x00000010 | |
|
175 | #define GPFSEL5 0x00000014 | |
|
176 | //bits 2-0 of GPFSEL0: set to 000 to make Pin 0 an output. 001 is an input. Other combinations represent alternate functions | |
|
177 | //bits 3-5 are for pin 1. | |
|
178 | //... | |
|
179 | //bits 27-29 are for pin 9. | |
|
180 | //GPFSEL1 repeats, but bits 2-0 are Pin 10, 27-29 are pin 19. | |
|
181 | //... | |
|
182 | #define GPSET0 0x0000001C //GPIO Pin Output Set. There are 2 of these (32 bit registers) | |
|
183 | #define GPSET1 0x00000020 | |
|
184 | //writing a '1' to bit N of GPSET0 makes that pin HIGH. | |
|
185 | //writing a '0' has no effect. | |
|
186 | //GPSET0[0-31] maps to pins 0-31 | |
|
187 | //GPSET1[0-21] maps to pins 32-53 | |
|
188 | #define GPCLR0 0x00000028 //GPIO Pin Output Clear. There are 2 of these (32 bits each) | |
|
189 | #define GPCLR1 0x0000002C | |
|
190 | //GPCLR acts the same way as GPSET, but clears the pin instead. | |
|
191 | #define GPLEV0 0x00000034 //GPIO Pin Level. There are 2 of these (32 bits each) | |
|
192 | ||
|
193 | //physical addresses for the DMA peripherals, as found in the processor documentation: | |
|
194 | #define DMA_BASE 0x20007000 | |
|
195 | #define DMACH(n) (0x100*(n)) | |
|
196 | //DMA Channel register sets (format of these registers is found in DmaChannelHeader struct): | |
|
197 | //#define DMACH0 0x00000000 | |
|
198 | //#define DMACH1 0x00000100 | |
|
199 | //#define DMACH2 0x00000200 | |
|
200 | //#define DMACH3 0x00000300 | |
|
201 | //... | |
|
202 | //Each DMA channel has some associated registers, but only CS (control and status), CONBLK_AD (control block address), and DEBUG are writeable | |
|
203 | //DMA is started by writing address of the first Control Block to the DMA channel's CONBLK_AD register and then setting the ACTIVE bit inside the CS register (bit 0) | |
|
204 | //Note: DMA channels are connected directly to peripherals, so physical addresses should be used (affects control block's SOURCE, DEST and NEXTCONBK addresses). | |
|
205 | #define DMAENABLE 0x00000ff0 //bit 0 should be set to 1 to enable channel 0. bit 1 enables channel 1, etc. | |
|
206 | ||
|
207 | //flags used in the DmaChannelHeader struct: | |
|
#define DMA_CS_RESET (1u<<31) //unsigned: 1<<31 overflows a signed int (UB)
#define DMA_CS_ABORT (1<<30)
#define DMA_CS_DISDEBUG (1<<28) //DMA will not stop when debug signal is asserted
//NOTE: & binds looser than <<, so the original ((x)&0xf << 16) parsed as
//  (x) & (0xf<<16) and DMA_CS_PRIORITY(7) evaluated to 0. The mask must be
//  applied to x BEFORE the shift:
#define DMA_CS_PRIORITY(x) (((x)&0xf) << 16) //higher priority DMA transfers are serviced first, it would appear
#define DMA_CS_PRIORITY_MAX DMA_CS_PRIORITY(7)
#define DMA_CS_PANIC_PRIORITY(x) (((x)&0xf) << 20)
#define DMA_CS_PANIC_PRIORITY_MAX DMA_CS_PANIC_PRIORITY(7)
#define DMA_CS_END (1<<1)
#define DMA_CS_ACTIVE (1<<0)
|
217 | ||
|
218 | #define DMA_DEBUG_READ_ERROR (1<<2) | |
|
219 | #define DMA_DEBUG_FIFO_ERROR (1<<1) | |
|
220 | #define DMA_DEBUG_READ_LAST_NOT_SET_ERROR (1<<0) | |
|
221 | ||
|
222 | //flags used in the DmaControlBlock struct: | |
|
223 | #define DMA_CB_TI_NO_WIDE_BURSTS (1<<26) | |
|
224 | #define DMA_CB_TI_PERMAP_NONE (0<<16) | |
|
225 | #define DMA_CB_TI_PERMAP_DSI (1<<16) | |
|
226 | //... (more found on page 61 of BCM2835 pdf | |
|
227 | #define DMA_CB_TI_PERMAP_PWM (5<<16) | |
|
228 | //... | |
|
229 | #define DMA_CB_TI_SRC_DREQ (1<<10) | |
|
230 | #define DMA_CB_TI_SRC_INC (1<<8) | |
|
231 | #define DMA_CB_TI_DEST_DREQ (1<<6) | |
|
232 | #define DMA_CB_TI_DEST_INC (1<<4) | |
|
233 | #define DMA_CB_TI_TDMODE (1<<1) | |
|
234 | ||
|
235 | ||
|
//https://dev.openwrt.org/browser/trunk/target/linux/brcm2708/patches-3.10/0070-bcm2708_fb-DMA-acceleration-for-fb_copyarea.patch?rev=39770 says that YLENGTH should actually be written as # of copies *MINUS ONE*
//YLENGTH occupies bits 16-29 of TXFR_LEN (14 bits), so its mask is 0x3fff;
//the previous 0x4fff mask was a typo (bit 14 set, bits 12-13 clear) and could
//corrupt the field for large y. Also parenthesize (y) before subtracting.
#define DMA_CB_TXFR_LEN_YLENGTH(y) ((((y)-1)&0x3fff) << 16)
#define DMA_CB_TXFR_LEN_XLENGTH(x) ((x)&0xffff)
#define DMA_CB_TXFR_YLENGTH_MASK (0x3fff << 16)
#define DMA_CB_STRIDE_D_STRIDE(x)  (((x)&0xffff) << 16)
#define DMA_CB_STRIDE_S_STRIDE(x)  ((x)&0xffff)
|
242 | ||
|
243 | ||
|
244 | //Dma Control Blocks must be located at addresses that are multiples of 32 bytes | |
|
245 | #define DMA_CONTROL_BLOCK_ALIGNMENT 32 | |
|
246 | ||
|
247 | #define PWM_BASE 0x2020C000 | |
|
248 | #define PWM_BASE_BUS 0x7E20C000 | |
|
249 | #define PWM_CTL 0x00000000 //control register | |
|
250 | #define PWM_STA 0x00000004 //status register | |
|
251 | #define PWM_DMAC 0x00000008 //DMA control register | |
|
252 | #define PWM_RNG1 0x00000010 //channel 1 range register (# output bits to use per sample) | |
|
253 | #define PWM_DAT1 0x00000014 //channel 1 data | |
|
254 | #define PWM_FIF1 0x00000018 //channel 1 fifo (write to this register to queue an output) | |
|
255 | #define PWM_RNG2 0x00000020 //channel 2 range register | |
|
256 | #define PWM_DAT2 0x00000024 //channel 2 data | |
|
257 | ||
|
258 | #define PWM_CTL_USEFIFO2 (1<<13) | |
|
259 | #define PWM_CTL_REPEATEMPTY2 (1<<10) | |
|
260 | #define PWM_CTL_ENABLE2 (1<<8) | |
|
261 | #define PWM_CTL_CLRFIFO (1<<6) | |
|
262 | #define PWM_CTL_USEFIFO1 (1<<5) | |
|
263 | #define PWM_CTL_REPEATEMPTY1 (1<<2) | |
|
264 | #define PWM_CTL_ENABLE1 (1<<0) | |
|
265 | ||
|
#define PWM_STA_BUSERR (1<<8)
#define PWM_STA_GAPERRS (0xf << 4)
#define PWM_STA_FIFOREADERR (1<<3)
#define PWM_STA_FIFOWRITEERR (1<<2)
//parenthesized: without the outer parens, an expression like
//  (sta & PWM_STA_ERRS) expanded to (sta & PWM_STA_BUSERR) | ... ,
//which is not the intended test.
#define PWM_STA_ERRS (PWM_STA_BUSERR | PWM_STA_GAPERRS | PWM_STA_FIFOREADERR | PWM_STA_FIFOWRITEERR)
|
271 | ||
|
272 | #define PWM_DMAC_EN (1<<31) | |
|
273 | #define PWM_DMAC_PANIC(P) (((P)&0xff)<<8) | |
|
274 | #define PWM_DMAC_DREQ(D) (((D)&0xff)<<0) | |
|
275 | ||
|
276 | //The following is undocumented :( Taken from http://www.scribd.com/doc/127599939/BCM2835-Audio-clocks | |
|
277 | #define CLOCK_BASE 0x20101000 | |
|
278 | #define CM_PWMCTL 0xa0 | |
|
279 | #define CM_PWMDIV 0xa4 | |
|
280 | //each write to CM_PWMTL and CM_PWMDIV requires the password to be written: | |
|
281 | #define CM_PWMCTL_PASSWD 0x5a000000 | |
|
282 | #define CM_PWMDIV_PASSWD 0x5a000000 | |
|
283 | //MASH is used to achieve fractional clock dividers by introducing artificial jitter. | |
|
284 | //if you want constant frequency (even if it may not be at 100% CORRECT frequency), use MASH0 | |
|
285 | //if clock divisor is integral, then there's no need to use MASH, and anything above MASH1 can introduce jitter. | |
|
#define CM_PWMCTL_MASH(x) (((x)&0x3) << 9)
//fixed: these previously expanded to CM_PWMTRL_MASH(n) — a typo'd,
//undefined macro name — and failed to compile wherever they were used.
#define CM_PWMCTL_MASH0 CM_PWMCTL_MASH(0)
#define CM_PWMCTL_MASH1 CM_PWMCTL_MASH(1)
#define CM_PWMCTL_MASH2 CM_PWMCTL_MASH(2)
#define CM_PWMCTL_MASH3 CM_PWMCTL_MASH(3)
|
291 | #define CM_PWMCTL_FLIP (1<<8) //use to inverse clock polarity | |
|
292 | #define CM_PWMCTL_BUSY (1<<7) //read-only flag that indicates clock generator is running. | |
|
293 | #define CM_PWMCTL_KILL (1<<5) //write a 1 to stop & reset clock generator. USED FOR DEBUG ONLY | |
|
294 | #define CM_PWMCTL_ENAB (1<<4) //gracefully stop/start clock generator. BUSY flag will go low once clock is off. | |
|
295 | #define CM_PWMCTL_SRC(x) ((x)&0xf) //clock source. 0=gnd. 1=oscillator. 2-3=debug. 4=PLLA per. 5=PLLC per. 6=PLLD per. 7=HDMI aux. 8-15=GND | |
|
296 | #define CM_PWMCTL_SRC_OSC CM_PWMCTL_SRC(1) | |
|
297 | #define CM_PWMCTL_SRC_PLLA CM_PWMCTL_SRC(4) | |
|
298 | #define CM_PWMCTL_SRC_PLLC CM_PWMCTL_SRC(5) | |
|
299 | #define CM_PWMCTL_SRC_PLLD CM_PWMCTL_SRC(6) | |
|
300 | ||
|
301 | //max clock divisor is 4095 | |
|
302 | #define CM_PWMDIV_DIVI(x) (((x)&0xfff) << 12) | |
|
303 | #define CM_PWMDIV_DIVF(x) ((x)&0xfff) | |
|
304 | ||
|
//Memory layout of one DMA channel's register set (the code maps it at
//DMA_BASE + DMACH(n)); field order must match the hardware exactly.
struct DmaChannelHeader {
    //Note: dma channels 7-15 are 'LITE' dma engines (or is it 8-15?), with reduced performance & functionality.
    //Note: only CS, CONBLK_AD and DEBUG are directly writeable
    volatile uint32_t CS; //Control and Status
        //31    RESET; set to 1 to reset DMA
        //30    ABORT; set to 1 to abort current DMA control block (next one will be loaded & continue)
        //29    DISDEBUG; set to 1 and DMA won't be paused when debug signal is sent
        //28    WAIT_FOR_OUTSTANDING_WRITES(0x10000000); set to 1 and DMA will wait until peripheral says all writes have gone through before loading next CB
        //24-27 reserved
        //20-23 PANIC_PRIORITY; 0 is lowest priority
        //16-19 PRIORITY; bus scheduling priority. 0 is lowest
        //9-15  reserved
        //8     ERROR; read as 1 when error is encountered. error can be found in DEBUG register.
        //7     reserved
        //6     WAITING_FOR_OUTSTANDING_WRITES; read as 1 when waiting for outstanding writes
        //5     DREQ_STOPS_DMA(0x20); read as 1 if DREQ is currently preventing DMA
        //4     PAUSED(0x10); read as 1 if DMA is paused
        //3     DREQ; copy of the data request signal from the peripheral, if DREQ is enabled. reads as 1 if data is being requested (or PERMAP=0), else 0
        //2     INT; set when current CB ends and its INTEN=1. Write a 1 to this register to clear it
        //1     END; set when the transfer defined by current CB is complete. Write 1 to clear.
        //0     ACTIVE(0x01); write 1 to activate DMA (load the CB before hand)
    volatile uint32_t CONBLK_AD; //Control Block Address
    volatile uint32_t TI; //transfer information; see DmaControlBlock.TI for description
    volatile uint32_t SOURCE_AD; //Source address
    volatile uint32_t DEST_AD; //Destination address
    volatile uint32_t TXFR_LEN; //transfer length. ONLY THE LOWER 16 BITS ARE USED IN LITE DMA ENGINES
    volatile uint32_t STRIDE; //2D Mode Stride. Only used if TI.TDMODE = 1. NOT AVAILABLE IN LITE DMA ENGINES
    volatile uint32_t NEXTCONBK; //Next control block. Must be 256-bit aligned (32 bytes; 8 words)
    volatile uint32_t DEBUG; //controls debug settings
        //29-31 unused
        //28    LITE (0x10000000)
        //25-27 VERSION
        //16-24 DMA_STATE (dma engine state machine)
        //8-15  DMA_ID (AXI bus id)
        //4-7   OUTSTANDING_WRITES
        //3     unused
        //2     READ_ERROR
        //1     WRITE_ERROR
        //0     READ_LAST_NOT_SET_ERROR
};
|
345 | void logDmaChannelHeader(struct DmaChannelHeader *h) { | |
|
346 | printf("Dma Ch Header:\n CS: 0x%08x\n CONBLK_AD: 0x%08x\n TI: 0x%08x\n SOURCE_AD: 0x%08x\n DEST_AD: 0x%08x\n TXFR_LEN: %u\n STRIDE: 0x%08x\n NEXTCONBK: 0x%08x\n DEBUG: 0x%08x\n", h->CS, h->CONBLK_AD, h->TI, h->SOURCE_AD, h->DEST_AD, h->TXFR_LEN, h->STRIDE, h->NEXTCONBK, h->DEBUG); | |
|
347 | } | |
|
348 | ||
|
//One DMA control block (the "instruction" format the DMA engine reads).
//Must be 256-bit (32-byte) aligned in memory; see DMA_CONTROL_BLOCK_ALIGNMENT.
struct DmaControlBlock {
    volatile uint32_t TI; //transfer information
        //31:27 unused
        //26    NO_WIDE_BURSTS
        //21:25 WAITS; number of cycles to wait between each DMA read/write operation
        //16:20 PERMAP(0x000Y0000); peripheral number to be used for DREQ signal (pacing). set to 0 for unpaced DMA.
        //12:15 BURST_LENGTH
        //11    SRC_IGNORE; set to 1 to not perform reads. Used to manually fill caches
        //10    SRC_DREQ; set to 1 to have the DREQ from PERMAP gate requests.
        //9     SRC_WIDTH; set to 1 for 128-bit moves, 0 for 32-bit moves
        //8     SRC_INC; set to 1 to automatically increment the source address after each read (you'll want this if you're copying a range of memory)
        //7     DEST_IGNORE; set to 1 to not perform writes.
        //6     DEST_DREQ; set to 1 to have the DREQ from PERMAP gate *writes*
        //5     DEST_WIDTH; set to 1 for 128-bit moves, 0 for 32-bit moves
        //4     DEST_INC; set to 1 to automatically increment the destination address after each write (you'll want this if you're copying a range of memory)
        //3     WAIT_RESP; make DMA wait for a response from the peripheral during each write. Ensures multiple writes don't get stacked in the pipeline
        //2     unused (0)
        //1     TDMODE; set to 1 to enable 2D mode
        //0     INTEN; set to 1 to generate an interrupt upon completion
    volatile uint32_t SOURCE_AD; //Source address
    volatile uint32_t DEST_AD; //Destination address
    volatile uint32_t TXFR_LEN; //transfer length.
        //in 2D mode, TXFR_LEN is separated into two half-words to indicate Y transfers of length X, and STRIDE is added to the src/dest address after each transfer of length X.
        //30:31 unused
        //16-29 YLENGTH
        //0-15  XLENGTH
    volatile uint32_t STRIDE; //2D Mode Stride (amount to increment/decrement src/dest after each 1d copy when in 2d mode). Only used if TI.TDMODE = 1
        //16-31 D_STRIDE; signed (2's complement) byte increment/decrement to apply to destination addr after each XLENGTH transfer
        //0-15  S_STRIDE; signed (2's complement) byte increment/decrement to apply to source addr after each XLENGTH transfer
    volatile uint32_t NEXTCONBK; //Next control block. Must be 256-bit aligned (32 bytes; 8 words)
    uint32_t _reserved[2]; //pads the struct to the 32-byte CB size the engine expects
};
|
381 | ||
|
382 | void logDmaControlBlock(struct DmaControlBlock *b) { | |
|
383 | printf("Dma Control Block:\n TI: 0x%08x\n SOURCE_AD: 0x%08x\n DEST_AD: 0x%08x\n TXFR_LEN: 0x%08x\n STRIDE: 0x%08x\n NEXTCONBK: 0x%08x\n unused: 0x%08x %08x\n", b->TI, b->SOURCE_AD, b->DEST_AD, b->TXFR_LEN, b->STRIDE, b->NEXTCONBK, b->_reserved[0], b->_reserved[1]); | |
|
384 | } | |
|
385 | ||
|
//Memory layout of the PWM peripheral's register set (mapped at PWM_BASE);
//field order/offsets must match the hardware exactly.
struct PwmHeader {
    volatile uint32_t CTL;  // 0x00000000 //control register
        //16-31 reserved
        //15 MSEN2 (0: PWM algorithm, 1:M/S transmission used)
        //14 reserved
        //13 USEF2 (0: data register is used for transmission, 1: FIFO is used for transmission)
        //12 POLA2 (0: 0=low, 1=high. 1: 0=high, 1=low (inversion))
        //11 SBIT2; defines the state of the output when no transmission is in place
        //10 RPTL2; 0: transmission interrupts when FIFO is empty. 1: last data in FIFO is retransmitted when FIFO is empty
        //9  MODE2; 0: PWM mode. 1: serializer mode
        //8  PWMEN2; 0: channel is disabled. 1: channel is enabled
        //7  MSEN1;
        //6  CLRF1; writing a 1 to this bit clears the channel 1 (and channel 2?) fifo
        //5  USEF1;
        //4  POLA1;
        //3  SBIT1;
        //2  RPTL1;
        //1  MODE1;
        //0  PWMEN1;
    volatile uint32_t STA;  // 0x00000004 //status register
        //13-31 reserved
        //9-12 STA1-4; indicates whether each channel is transmitting
        //8    BERR; Bus Error Flag. Write 1 to clear
        //4-7  GAPO1-4; Gap Occurred Flag. Write 1 to clear
        //3    RERR1; Fifo Read Error Flag (attempt to read empty fifo). Write 1 to clear
        //2    WERR1; Fifo Write Error Flag (attempt to write to full fifo). Write 1 to clear
        //1    EMPT1; Reads as 1 if fifo is empty
        //0    FULL1; Reads as 1 if fifo is full
    volatile uint32_t DMAC; // 0x00000008 //DMA control register
        //31    ENAB; set to 1 to enable DMA
        //16-30 reserved
        //8-15  PANIC; DMA threshold for panic signal
        //0-7   DREQ; DMA threshold for DREQ signal
    uint32_t _padding1;     //no register at 0x0000000C; keeps RNG1 at its hardware offset
    volatile uint32_t RNG1; // 0x00000010 //channel 1 range register (# output bits to use per sample)
        //0-31 PWM_RNGi; #of bits to modulate PWM. (eg if PWM_RNGi=1024, then each 32-bit sample sent through the FIFO will be modulated into 1024 bits.)
    volatile uint32_t DAT1; // 0x00000014 //channel 1 data
        //0-31 PWM_DATi; Stores the 32-bit data to be sent to the PWM controller ONLY WHEN USEFi=0 (FIFO is disabled)
    volatile uint32_t FIF1; // 0x00000018 //channel 1 fifo (write to this register to queue an output)
        //writing to this register will queue a sample into the fifo. If 2 channels are enabled, then each even sample (0-indexed) is sent to channel 1, and odd samples are sent to channel 2. WRITE-ONLY
    uint32_t _padding2;     //no register at 0x0000001C; keeps RNG2 at its hardware offset
    volatile uint32_t RNG2; // 0x00000020 //channel 2 range register
    volatile uint32_t DAT2; // 0x00000024 //channel 2 data
        //0-31 PWM_DATi; Stores the 32-bit data to be sent to the PWM controller ONLY WHEN USEFi=1 (FIFO is enabled). TODO: Typo??? (presumably should read USEFi=0, matching DAT1 above — confirm against datasheet)
};
|
431 | ||
|
struct GpioBufferFrame {
    //custom structure used for storing the GPIO buffer.
    //These BufferFrame's are DMA'd into the GPIO memory, potentially using the DmaEngine's Stride facility
    uint32_t gpset[2]; //words destined for GPSET0/GPSET1 (bits to drive HIGH)
    uint32_t gpclr[2]; //words destined for GPCLR0/GPCLR1 (bits to drive LOW)
};
|
438 | ||
|
439 | struct DmaChannelHeader *dmaHeader; //must be global for cleanup() | |
|
440 | ||
|
//Request realtime (SCHED_FIFO) scheduling at the given priority for the
//calling thread, so the kernel avoids interrupting us for long durations.
//Must run as super-user for this to succeed; on failure a warning is printed
//and execution continues with default scheduling.
void setSchedPriority(int priority) {
    //zero-initialize so any non-POSIX extension fields are defined
    struct sched_param sp = { .sched_priority = priority };
    //parenthesize the assignment-in-condition explicitly (the original
    //`if (ret = ...)` relied on it implicitly and trips -Wparentheses)
    int ret = pthread_setschedparam(pthread_self(), SCHED_FIFO, &sp);
    if (ret != 0) {
        printf("Warning: pthread_setschedparam (increase thread priority) returned non-zero: %i\n", ret);
    }
}
|
451 | ||
|
//Read-modify-write helper: within *dest, replace only the bits selected by
//`mask` with the corresponding bits of `value`; all other bits are untouched.
//eg if x = 0b11001100,
//   writeBitmasked(&x, 0b00000110, 0b11110011)
//   leaves x = 0b11001010
void writeBitmasked(volatile uint32_t *dest, uint32_t mask, uint32_t value) {
    uint32_t merged = (*dest & ~mask) | (value & mask);
    *dest = merged;
    *dest = merged; //write twice; best to be safe when crossing memory boundaries
}
|
462 | ||
|
463 | ||
|
464 | uint64_t readSysTime(volatile uint32_t *timerBaseMem) { | |
|
465 | return ((uint64_t)*(timerBaseMem + TIMER_CHI/4) << 32) + (uint64_t)(*(timerBaseMem + TIMER_CLO/4)); | |
|
466 | } | |
|
467 | ||
|
468 | size_t ceilToPage(size_t size) { | |
|
469 | //round up to nearest page-size multiple | |
|
470 | if (size & (PAGE_SIZE-1)) { | |
|
471 | size += PAGE_SIZE - (size & (PAGE_SIZE-1)); | |
|
472 | } | |
|
473 | return size; | |
|
474 | } | |
|
475 | ||
|
476 | uintptr_t virtToPhys(void* virt, int pagemapfd) { | |
|
477 | uintptr_t pgNum = (uintptr_t)(virt)/PAGE_SIZE; | |
|
478 | int byteOffsetFromPage = (uintptr_t)(virt)%PAGE_SIZE; | |
|
479 | uint64_t physPage; | |
|
480 | ///proc/self/pagemap is a uint64_t array where the index represents the virtual page number and the value at that index represents the physical page number. | |
|
481 | //So if virtual address is 0x1000000, read the value at *array* index 0x1000000/PAGE_SIZE and multiply that by PAGE_SIZE to get the physical address. | |
|
482 | //because files are bytestreams, one must explicitly multiply each byte index by 8 to treat it as a uint64_t array. | |
|
483 | int err = lseek(pagemapfd, pgNum*8, SEEK_SET); | |
|
484 | if (err != pgNum*8) { | |
|
485 | printf("WARNING: virtToPhys %p failed to seek (expected %i got %i. errno: %i)\n", virt, pgNum*8, err, errno); | |
|
486 | } | |
|
487 | read(pagemapfd, &physPage, 8); | |
|
488 | if (!physPage & (1ull<<63)) { //bit 63 is set to 1 if the page is present in ram | |
|
489 | printf("WARNING: virtToPhys %p has no physical address\n", virt); | |
|
490 | } | |
|
491 | physPage = physPage & ~(0x1ffull << 55); //bits 55-63 are flags. | |
|
492 | uintptr_t mapped = (uintptr_t)(physPage*PAGE_SIZE + byteOffsetFromPage); | |
|
493 | return mapped; | |
|
494 | } | |
|
495 | ||
|
//Translate a virtual address to its *uncached* bus-address equivalent.
//The RAM is aliased at bus address 0x40000000; accesses through that alias
//bypass the CPU (L1) cache but not the L2 cache. Use a 0xc0000000 base instead
//if L2 must also be bypassed. However, the DMA engine is aware of L2 cache -
//just not the L1 cache.
//(source: http://en.wikibooks.org/wiki/Aros/Platforms/Arm_Raspberry_Pi_support#Framebuffer )
uintptr_t virtToUncachedPhys(void *virt, int pagemapfd) {
    uintptr_t phys = virtToPhys(virt, pagemapfd);
    return phys | 0x40000000; //bus address of the ram is 0x40000000
}
|
499 | ||
|
500 | ||
|
501 | //allocate some memory and lock it so that its physical address will never change | |
|
502 | void* makeLockedMem(size_t size) { | |
|
503 | //void* mem = valloc(size); //memory returned by valloc is not zero'd | |
|
504 | size = ceilToPage(size); | |
|
505 | void *mem = mmap( | |
|
506 | NULL, //let kernel place memory where it wants | |
|
507 | size, //length | |
|
508 | PROT_WRITE | PROT_READ, //ask for read and write permissions to memory | |
|
509 | MAP_SHARED | | |
|
510 | MAP_ANONYMOUS | //no underlying file; initialize to 0 | |
|
511 | MAP_NORESERVE | //don't reserve swap space | |
|
512 | MAP_LOCKED, //lock into *virtual* ram. Physical ram may still change! | |
|
513 | -1, // File descriptor | |
|
514 | 0); //no offset into file (file doesn't exist). | |
|
515 | if (mem == MAP_FAILED) { | |
|
516 | printf("makeLockedMem failed\n"); | |
|
517 | exit(1); | |
|
518 | } | |
|
519 | memset(mem, 0, size); //simultaneously zero the pages and force them into memory | |
|
520 | mlock(mem, size); | |
|
521 | return mem; | |
|
522 | } | |
|
523 | ||
|
//free memory allocated with makeLockedMem
//(the size is rounded up to whole pages, matching the allocation)
void freeLockedMem(void* mem, size_t size) {
    size_t pageBytes = ceilToPage(size);
    munlock(mem, pageBytes);
    munmap(mem, pageBytes);
}
|
530 | ||
|
531 | void* makeUncachedMemView(void* virtaddr, size_t bytes, int memfd, int pagemapfd) { | |
|
532 | //by default, writing to any virtual address will go through the CPU cache. | |
|
533 | //this function will return a pointer that behaves the same as virtaddr, but bypasses the CPU L1 cache (note that because of this, the returned pointer and original pointer should not be used in conjunction, else cache-related inconsistencies will arise) | |
|
534 | //Note: The original memory should not be unmapped during the lifetime of the uncached version, as then the OS won't know that our process still owns the physical memory. | |
|
535 | bytes = ceilToPage(bytes); | |
|
536 | //first, just allocate enough *virtual* memory for the operation. This is done so that we can do the later mapping to a contiguous range of virtual memory: | |
|
537 | void *mem = mmap( | |
|
538 | NULL, //let kernel place memory where it wants | |
|
539 | bytes, //length | |
|
540 | PROT_WRITE | PROT_READ, //ask for read and write permissions to memory | |
|
541 | MAP_SHARED | | |
|
542 | MAP_ANONYMOUS | //no underlying file; initialize to 0 | |
|
543 | MAP_NORESERVE | //don't reserve swap space | |
|
544 | MAP_LOCKED, //lock into *virtual* ram. Physical ram may still change! | |
|
545 | -1, // File descriptor | |
|
546 | 0); //no offset into file (file doesn't exist). | |
|
547 | //now, free the virtual memory and immediately remap it to the physical addresses used in virtaddr | |
|
548 | munmap(mem, bytes); //Might not be necessary; MAP_FIXED indicates it can map an already-used page | |
|
549 | for (int offset=0; offset<bytes; offset += PAGE_SIZE) { | |
|
550 | void *mappedPage = mmap(mem+offset, PAGE_SIZE, PROT_WRITE|PROT_READ, MAP_SHARED|MAP_FIXED|MAP_NORESERVE|MAP_LOCKED, memfd, virtToUncachedPhys(virtaddr+offset, pagemapfd)); | |
|
551 | if (mappedPage != mem+offset) { //We need these mappings to be contiguous over virtual memory (in order to replicate the virtaddr array), so we must ensure that the address we requested from mmap was actually used. | |
|
552 | printf("Failed to create an uncached view of memory at addr %p+0x%08x\n", virtaddr, offset); | |
|
553 | exit(1); | |
|
554 | } | |
|
555 | } | |
|
556 | memset(mem, 0, bytes); //Although the cached version might have been reset, those writes might not have made it through. | |
|
557 | return mem; | |
|
558 | } | |
|
559 | ||
|
//free memory allocated with makeUncachedMemView
//(only the view is unmapped; the underlying locked allocation is released
//separately via freeLockedMem)
void freeUncachedMemView(void* mem, size_t size) {
    munmap(mem, ceilToPage(size));
}
|
565 | ||
|
566 | //map a physical address into our virtual address space. memfd is the file descriptor for /dev/mem | |
|
567 | volatile uint32_t* mapPeripheral(int memfd, int addr) { | |
|
568 | ///dev/mem behaves as a file. We need to map that file into memory: | |
|
569 | //NULL = virtual address of mapping is chosen by kernel. | |
|
570 | //PAGE_SIZE = map 1 page. | |
|
571 | //PROT_READ|PROT_WRITE means give us read and write priveliges to the memory | |
|
572 | //MAP_SHARED means updates to the mapped memory should be written back to the file & shared with other processes | |
|
573 | //memfd = /dev/mem file descriptor | |
|
574 | //addr = offset in file to map | |
|
575 | void *mapped = mmap(NULL, PAGE_SIZE, PROT_READ|PROT_WRITE, MAP_SHARED, memfd, addr); | |
|
576 | //now, *mapped = memory at physical address of addr. | |
|
577 | if (mapped == MAP_FAILED) { | |
|
578 | printf("failed to map memory (did you remember to run as root?)\n"); | |
|
579 | exit(1); | |
|
580 | } else { | |
|
581 | printf("mapped: %p\n", mapped); | |
|
582 | } | |
|
583 | return (volatile uint32_t*)mapped; | |
|
584 | } | |
|
585 | ||
|
586 | ||
|
587 | void cleanup() { | |
|
588 | printf("Cleanup\n"); | |
|
589 | //disable DMA. Otherwise, it will continue to run in the background, potentially overwriting future user data. | |
|
590 | if(dmaHeader) { | |
|
591 | writeBitmasked(&dmaHeader->CS, DMA_CS_ACTIVE, 0); | |
|
592 | usleep(100); | |
|
593 | writeBitmasked(&dmaHeader->CS, DMA_CS_RESET, DMA_CS_RESET); | |
|
594 | } | |
|
595 | //could also disable PWM, but that's not imperative. | |
|
596 | } | |
|
597 | ||
|
//Signal handler: halt DMA, report which signal was caught, and terminate with
//an error status.
void cleanupAndExit(int sig) {
    cleanup();
    printf("Exiting with error; caught signal: %i\n", sig);
    exit(1);
}
|
603 | ||
|
//Sleep (approximately) until the system timer reaches the absolute time
//`micros`.
//Note: cannot use clock_nanosleep with an absolute time, as the process clock
//may differ from the RPi clock. This function doesn't need to be super precise,
//so we can tolerate interrupts; therefore a relative sleep suffices.
void sleepUntilMicros(uint64_t micros, volatile uint32_t* timerBaseMem) {
    uint64_t now = readSysTime(timerBaseMem);
    if (micros <= now) {
        return; //already past the deadline; also avoids unsigned-subtraction overflow
    }
    uint64_t dur = micros - now;
    //usleep(dur); //nope, causes problems!
    struct timespec t = {
        .tv_sec  = dur/1000000,
        .tv_nsec = (dur%1000000)*1000,
    };
    nanosleep(&t, NULL);
}
|
618 | ||
|
619 | ||
|
620 | //int64_t _lastTimeAtFrame0; | |
|
621 | ||
|
//Schedule a single GPIO transition: drive `pin` to level `mode` (0=off, 1=on)
//at the absolute system-timer time `micros` (uS) by setting the appropriate
//bit in the circular GpioBufferFrame array that the DMA engine streams to the
//GPIO set/clear registers. Sleeps first if `micros` lies further in the future
//than one buffer length (the slot for it would not exist yet).
void queue(int pin, int mode, uint64_t micros, struct GpioBufferFrame* srcArray, volatile uint32_t* timerBaseMem, struct DmaChannelHeader* dmaHeader) {
    //This function takes a pin, a mode (0=off, 1=on) and a time. It then manipulates the GpioBufferFrame array in order to ensure that the pin switches to the desired level at the desired time. It will sleep if necessary.
    //Sleep until we are on the right iteration of the circular buffer (otherwise we cannot queue the command)
    uint64_t callTime = readSysTime(timerBaseMem); //only used for debugging
    uint64_t desiredTime = micros - FRAME_TO_USEC(SOURCE_BUFFER_FRAMES); //earliest time at which `micros` falls inside the buffer window
    sleepUntilMicros(desiredTime, timerBaseMem);
    uint64_t awakeTime = readSysTime(timerBaseMem); //only used for debugging

    //get the current source index at the current time:
    //must ensure we aren't interrupted during this calculation, hence the two timers instead of 1.
    //Note: getting the curTime & srcIdx don't have to be done for every call to queue - it could be done eg just once per buffer.
    //  It should be calculated regularly though, to counter clock drift & PWM FIFO underflows
    //  It is done in this function only for simplicity
    int srcIdx;
    uint64_t curTime1, curTime2;
    int tries=0; //snapshot attempts; reported in the behind-schedule warning below
    do {
        curTime1 = readSysTime(timerBaseMem);
        srcIdx = dmaHeader->STRIDE; //the source index is stored in the otherwise-unused STRIDE register, for efficiency
        curTime2 = readSysTime(timerBaseMem);
        ++tries;
    } while (curTime2-curTime1 > 1 || (srcIdx & DMA_CB_TXFR_YLENGTH_MASK)); //allow 1 uS variability. NOTE(review): the YLENGTH-mask test presumably rejects STRIDE values captured mid-update by the engine -- confirm against the control-block STRIDE encoding.
    //Uncomment the following lines and the above declaration of _lastTimeAtFrame0 to log jitter information:
    //int64_t curTimeAtFrame0 = curTime2 - FRAME_TO_USEC(srcIdx);
    //printf("Timing diff: %lli\n", (curTimeAtFrame0-_lastTimeAtFrame0)%FRAME_TO_USEC(SOURCE_BUFFER_FRAMES));
    //_lastTimeAtFrame0 = curTimeAtFrame0;
    //if timing diff is positive, then then curTimeAtFrame0 > _lastTimeAtFrame0
    //curTime2 - srcIdx2 > curTime1 - srcIdx1
    //curTime2 - curTime2 > srcIdx2 - srcIdx1
    //more uS have elapsed than frames; DMA cannot keep up

    //calculate the frame# at which to place the event:
    int usecFromNow = micros - curTime2;
    int framesFromNow = USEC_TO_FRAME(usecFromNow);
    if (framesFromNow < 10) { //Not safe to schedule less than ~10uS into the future (note: should be operating on usecFromNow, not framesFromNow)
        printf("Warning: behind schedule: %i (%i uSec) (tries: %i) (sleep %llu -> %llu (wanted %llu))\n", framesFromNow, usecFromNow, tries, callTime, awakeTime, desiredTime);
        framesFromNow = 10; //push the event ahead so the engine can't pass it before the write below lands
    }
    int newIdx = (srcIdx + framesFromNow)%SOURCE_BUFFER_FRAMES;
    //Now queue the command:
    if (mode == 0) { //turn output off
        srcArray[newIdx].gpclr[pin>31] |= 1 << (pin%32); //[0] covers pins 0-31, [1] covers 32+
    } else { //turn output on
        srcArray[newIdx].gpset[pin>31] |= 1 << (pin%32);
    }
}
|
668 | ||
|
669 | int main() { | |
|
670 | volatile uint32_t *gpioBaseMem, *dmaBaseMem, *pwmBaseMem, *timerBaseMem, *clockBaseMem; | |
|
671 | //emergency clean-up: | |
|
672 | for (int i = 0; i < 64; i++) { //catch all signals (like ctrl+c, ctrl+z, ...) to ensure DMA is disabled | |
|
673 | struct sigaction sa; | |
|
674 | memset(&sa, 0, sizeof(sa)); | |
|
675 | sa.sa_handler = cleanupAndExit; | |
|
676 | sigaction(i, &sa, NULL); | |
|
677 | } | |
|
678 | setSchedPriority(SCHED_PRIORITY); | |
|
679 | //First, open the linux device, /dev/mem | |
|
680 | //dev/mem provides access to the physical memory of the entire processor+ram | |
|
681 | //This is needed because Linux uses virtual memory, thus the process's memory at 0x00000000 will NOT have the same contents as the physical memory at 0x00000000 | |
|
682 | int memfd = open("/dev/mem", O_RDWR | O_SYNC); | |
|
683 | if (memfd < 0) { | |
|
684 | printf("Failed to open /dev/mem (did you remember to run as root?)\n"); | |
|
685 | exit(1); | |
|
686 | } | |
|
687 | int pagemapfd = open("/proc/self/pagemap", O_RDONLY); | |
|
688 | //now map /dev/mem into memory, but only map specific peripheral sections: | |
|
689 | gpioBaseMem = mapPeripheral(memfd, GPIO_BASE); | |
|
690 | dmaBaseMem = mapPeripheral(memfd, DMA_BASE); | |
|
691 | pwmBaseMem = mapPeripheral(memfd, PWM_BASE); | |
|
692 | timerBaseMem = mapPeripheral(memfd, TIMER_BASE); | |
|
693 | clockBaseMem = mapPeripheral(memfd, CLOCK_BASE); | |
|
694 | ||
|
695 | int outPin = 10; | |
|
696 | //now set our pin as an output: | |
|
697 | volatile uint32_t *fselAddr = (volatile uint32_t*)(gpioBaseMem + GPFSEL0/4 + outPin/10); | |
|
698 | writeBitmasked(fselAddr, 0x7 << (3*(outPin%10)), 0x1 << (3*(outPin%10))); | |
|
699 | //Note: PWM pacing still works, even with no physical outputs, so we don't need to set gpio pin 18 to its alternate function. | |
|
700 | ||
|
701 | //Often need to copy zeros with DMA. This array can be the source. Needs to all lie on one page | |
|
702 | void *zerosPageCached = makeLockedMem(PAGE_SIZE); | |
|
703 | void *zerosPage = makeUncachedMemView(zerosPageCached, PAGE_SIZE, memfd, pagemapfd); | |
|
704 | ||
|
705 | //configure DMA... | |
|
706 | //First, allocate memory for the source: | |
|
707 | size_t numSrcBlocks = SOURCE_BUFFER_FRAMES; //We want apx 1M blocks/sec. | |
|
708 | size_t srcPageBytes = numSrcBlocks*sizeof(struct GpioBufferFrame); | |
|
709 | void *virtSrcPageCached = makeLockedMem(srcPageBytes); | |
|
710 | void *virtSrcPage = makeUncachedMemView(virtSrcPageCached, srcPageBytes, memfd, pagemapfd); | |
|
711 | printf("mappedPhysSrcPage: %p\n", virtToPhys(virtSrcPage, pagemapfd)); | |
|
712 | ||
|
713 | //cast virtSrcPage to a GpioBufferFrame array: | |
|
714 | struct GpioBufferFrame *srcArray = (struct GpioBufferFrame*)virtSrcPage; //Note: calling virtToPhys on srcArray will return NULL. Use srcArrayCached for that. | |
|
715 | struct GpioBufferFrame *srcArrayCached = (struct GpioBufferFrame*)virtSrcPageCached; | |
|
716 | //srcArray[0].gpset[0] = (1 << outPin); //set pin ON | |
|
717 | //srcArray[numSrcBlocks/2].gpclr[0] = (1 << outPin); //set pin OFF; | |
|
718 | ||
|
719 | //configure PWM clock: | |
|
720 | *(clockBaseMem + CM_PWMCTL/4) = CM_PWMCTL_PASSWD | ((*(clockBaseMem + CM_PWMCTL/4))&(~CM_PWMCTL_ENAB)); //disable clock | |
|
721 | do {} while (*(clockBaseMem + CM_PWMCTL/4) & CM_PWMCTL_BUSY); //wait for clock to deactivate | |
|
722 | *(clockBaseMem + CM_PWMDIV/4) = CM_PWMDIV_PASSWD | CM_PWMDIV_DIVI(CLOCK_DIV); //configure clock divider (running at 500MHz undivided) | |
|
723 | *(clockBaseMem + CM_PWMCTL/4) = CM_PWMCTL_PASSWD | CM_PWMCTL_SRC_PLLD; //source 500MHz base clock, no MASH. | |
|
724 | *(clockBaseMem + CM_PWMCTL/4) = CM_PWMCTL_PASSWD | CM_PWMCTL_SRC_PLLD | CM_PWMCTL_ENAB; //enable clock | |
|
725 | do {} while (*(clockBaseMem + CM_PWMCTL/4) & CM_PWMCTL_BUSY == 0); //wait for clock to activate | |
|
726 | ||
|
727 | //configure rest of PWM: | |
|
728 | struct PwmHeader *pwmHeader = (struct PwmHeader*)(pwmBaseMem); | |
|
729 | ||
|
730 | pwmHeader->DMAC = 0; //disable DMA | |
|
731 | pwmHeader->CTL |= PWM_CTL_CLRFIFO; //clear pwm | |
|
732 | usleep(100); | |
|
733 | ||
|
734 | pwmHeader->STA = PWM_STA_ERRS; //clear PWM errors | |
|
735 | usleep(100); | |
|
736 | ||
|
737 | pwmHeader->DMAC = PWM_DMAC_EN | PWM_DMAC_DREQ(PWM_FIFO_SIZE) | PWM_DMAC_PANIC(PWM_FIFO_SIZE); //DREQ is activated at queue < PWM_FIFO_SIZE | |
|
738 | pwmHeader->RNG1 = BITS_PER_CLOCK; //used only for timing purposes; #writes to PWM FIFO/sec = PWM CLOCK / RNG1 | |
|
739 | pwmHeader->CTL = PWM_CTL_REPEATEMPTY1 | PWM_CTL_ENABLE1 | PWM_CTL_USEFIFO1; | |
|
740 | ||
|
741 | //allocate memory for the control blocks | |
|
742 | size_t cbPageBytes = numSrcBlocks * sizeof(struct DmaControlBlock) * 3; //3 cbs for each source block | |
|
743 | void *virtCbPageCached = makeLockedMem(cbPageBytes); | |
|
744 | void *virtCbPage = makeUncachedMemView(virtCbPageCached, cbPageBytes, memfd, pagemapfd); | |
|
745 | //fill the control blocks: | |
|
746 | struct DmaControlBlock *cbArrCached = (struct DmaControlBlock*)virtCbPageCached; | |
|
747 | struct DmaControlBlock *cbArr = (struct DmaControlBlock*)virtCbPage; | |
|
748 | printf("#dma blocks: %i, #src blocks: %i\n", numSrcBlocks*3, numSrcBlocks); | |
|
749 | for (int i=0; i<numSrcBlocks*3; i += 3) { | |
|
750 | //pace DMA through PWM | |
|
751 | cbArr[i].TI = DMA_CB_TI_PERMAP_PWM | DMA_CB_TI_DEST_DREQ | DMA_CB_TI_NO_WIDE_BURSTS | DMA_CB_TI_TDMODE; | |
|
752 | cbArr[i].SOURCE_AD = virtToUncachedPhys(srcArrayCached + i/3, pagemapfd); //The data written doesn't matter, but using the GPIO source will hopefully bring it into L2 for more deterministic timing of the next control block. | |
|
753 | cbArr[i].DEST_AD = PWM_BASE_BUS + PWM_FIF1; //write to the FIFO | |
|
754 | cbArr[i].TXFR_LEN = DMA_CB_TXFR_LEN_YLENGTH(1) | DMA_CB_TXFR_LEN_XLENGTH(4); | |
|
755 | cbArr[i].STRIDE = i/3; | |
|
756 | cbArr[i].NEXTCONBK = virtToUncachedPhys(cbArrCached+i+1, pagemapfd); //have to use the cached version because the uncached version isn't listed in pagemap(?) | |
|
757 | //copy buffer to GPIOs | |
|
758 | cbArr[i+1].TI = DMA_CB_TI_SRC_INC | DMA_CB_TI_DEST_INC | DMA_CB_TI_NO_WIDE_BURSTS | DMA_CB_TI_TDMODE; | |
|
759 | cbArr[i+1].SOURCE_AD = virtToUncachedPhys(srcArrayCached + i/3, pagemapfd); | |
|
760 | cbArr[i+1].DEST_AD = GPIO_BASE_BUS + GPSET0; | |
|
761 | cbArr[i+1].TXFR_LEN = DMA_CB_TXFR_LEN_YLENGTH(2) | DMA_CB_TXFR_LEN_XLENGTH(8); | |
|
762 | cbArr[i+1].STRIDE = DMA_CB_STRIDE_D_STRIDE(4) | DMA_CB_STRIDE_S_STRIDE(0); | |
|
763 | cbArr[i+1].NEXTCONBK = virtToUncachedPhys(cbArrCached+i+2, pagemapfd); | |
|
764 | //clear buffer (TODO: investigate using a 4-word copy ("burst") ) | |
|
765 | cbArr[i+2].TI = DMA_CB_TI_DEST_INC | DMA_CB_TI_NO_WIDE_BURSTS | DMA_CB_TI_TDMODE; | |
|
766 | cbArr[i+2].SOURCE_AD = virtToUncachedPhys(zerosPageCached, pagemapfd); | |
|
767 | cbArr[i+2].DEST_AD = virtToUncachedPhys(srcArrayCached + i/3, pagemapfd); | |
|
768 | cbArr[i+2].TXFR_LEN = DMA_CB_TXFR_LEN_YLENGTH(1) | DMA_CB_TXFR_LEN_XLENGTH(sizeof(struct GpioBufferFrame)); | |
|
769 | cbArr[i+2].STRIDE = i/3; //might be better to use the NEXT index | |
|
770 | int nextIdx = i+3 < numSrcBlocks*3 ? i+3 : 0; //last block should loop back to the first block | |
|
771 | cbArr[i+2].NEXTCONBK = virtToUncachedPhys(cbArrCached + nextIdx, pagemapfd); //(uint32_t)physCbPage + ((void*)&cbArr[(i+2)%maxIdx] - virtCbPage); | |
|
772 | } | |
|
773 | for (int i=0; i<cbPageBytes; i+=PAGE_SIZE) { | |
|
774 | printf("virt cb[%i] -> phys: 0x%08x (0x%08x)\n", i, virtToPhys(i+(void*)cbArrCached, pagemapfd), virtToUncachedPhys(i+(void*)cbArrCached, pagemapfd)); | |
|
775 | } | |
|
776 | //source: http://virtualfloppy.blogspot.com/2014/01/dma-support-at-last.html | |
|
777 | //cat /sys/module/dma/parameters/dmachans gives a bitmask of DMA channels that are not used by GPU. Results: ch 1, 3, 6, 7 are reserved. | |
|
778 | //dmesg | grep "DMA"; results: Ch 2 is used by SDHC host | |
|
779 | //ch 0 is known to be used for graphics acceleration | |
|
780 | //Thus, applications can use ch 4, 5, or the LITE channels @ 8 and beyond. | |
|
781 | //If using LITE channels, then we can't use the STRIDE feature, so that narrows it down to ch 4 and ch 5. | |
|
782 | int dmaCh = 5; | |
|
783 | //enable DMA channel (it's probably already enabled, but we want to be sure): | |
|
784 | writeBitmasked(dmaBaseMem + DMAENABLE, 1 << dmaCh, 1 << dmaCh); | |
|
785 | ||
|
786 | //configure the DMA header to point to our control block: | |
|
787 | dmaHeader = (struct DmaChannelHeader*)(dmaBaseMem + DMACH(dmaCh)/4); //must divide by 4, as dmaBaseMem is uint32_t* | |
|
788 | printf("Previous DMA header:\n"); | |
|
789 | logDmaChannelHeader(dmaHeader); | |
|
790 | //abort any previous DMA: | |
|
791 | //dmaHeader->NEXTCONBK = 0; //NEXTCONBK is read-only. | |
|
792 | dmaHeader->CS |= DMA_CS_ABORT; //make sure to disable dma first. | |
|
793 | usleep(100); //give time for the abort command to be handled. | |
|
794 | ||
|
795 | dmaHeader->CS = DMA_CS_RESET; | |
|
796 | usleep(100); | |
|
797 | ||
|
798 | writeBitmasked(&dmaHeader->CS, DMA_CS_END, DMA_CS_END); //clear the end flag | |
|
799 | dmaHeader->DEBUG = DMA_DEBUG_READ_ERROR | DMA_DEBUG_FIFO_ERROR | DMA_DEBUG_READ_LAST_NOT_SET_ERROR; // clear debug error flags | |
|
800 | uint32_t firstAddr = virtToUncachedPhys(cbArrCached, pagemapfd); | |
|
801 | printf("starting DMA @ CONBLK_AD=0x%08x\n", firstAddr); | |
|
802 | dmaHeader->CONBLK_AD = firstAddr; //(uint32_t)physCbPage + ((void*)cbArr - virtCbPage); //we have to point it to the PHYSICAL address of the control block (cb1) | |
|
803 | dmaHeader->CS = DMA_CS_PRIORITY(7) | DMA_CS_PANIC_PRIORITY(7) | DMA_CS_DISDEBUG; //high priority (max is 7) | |
|
804 | dmaHeader->CS = DMA_CS_PRIORITY(7) | DMA_CS_PANIC_PRIORITY(7) | DMA_CS_DISDEBUG | DMA_CS_ACTIVE; //activate DMA. | |
|
805 | ||
|
806 | uint64_t startTime = readSysTime(timerBaseMem); | |
|
807 | printf("DMA Active @ %llu uSec\n", startTime); | |
|
808 | /*while (dmaHeader->CS & DMA_CS_ACTIVE) { | |
|
809 | logDmaChannelHeader(dmaHeader); | |
|
810 | } //wait for DMA transfer to complete.*/ | |
|
811 | for (int i=1; ; ++i) { //generate the output sequence: | |
|
812 | //logDmaChannelHeader(dmaHeader); | |
|
813 | //this just toggles outPin every few us: | |
|
814 | queue(outPin, i%2, startTime + 1000*i, srcArray, timerBaseMem, dmaHeader); | |
|
815 | } | |
|
816 | //Exit routine: | |
|
817 | cleanup(); | |
|
818 | printf("Exiting cleanly:\n"); | |
|
819 | freeUncachedMemView(virtCbPage, cbPageBytes); | |
|
820 | freeLockedMem(virtCbPageCached, cbPageBytes); | |
|
821 | freeUncachedMemView(virtSrcPage, srcPageBytes); | |
|
822 | freeLockedMem(virtSrcPageCached, srcPageBytes); | |
|
823 | freeUncachedMemView(zerosPage, PAGE_SIZE); | |
|
824 | freeLockedMem(zerosPageCached, PAGE_SIZE); | |
|
825 | close(pagemapfd); | |
|
826 | close(memfd); | |
|
827 | return 0; | |
|
828 | } | |
|
829 |
General Comments 0
You need to be logged in to leave comments.
Login now