Parameterübergabe bei Inline-Assembler [Archiv]

Archiv verlassen und diese Seite im Standarddesign anzeigen : Parameterübergabe bei Inline-Assembler

ogni42

28.03.2006, 14:23

Hallo Forum,

habe angefangen mit inline Assembler zu arbeiten und stosse schon auf die ersten Probleme:

#include <inttypes.h>

/*
 * fadd - intended to return op1 + op2 as a 16-bit value: an 8-bit add whose
 * carry is propagated into the high byte, written as AVR inline assembly.
 *
 * NOTE(review): as posted, this version does not link; see the notes on the
 * individual template lines below.
 */
uint16_t fadd(uint8_t op1, uint8_t op2)
{
uint16_t result;
// load op1 into the 16-bit register
// add op2 with carry and return the result
asm volatile (
// NOTE(review): lds loads from a memory address, but %1 is a register
// operand ("r"). The register name is emitted as a symbol, which produces
// the linker error "undefined reference to `r24'".
"lds %A0, %1" "\n\t"
"add %A0, %2" "\n\t"
// NOTE(review): %B0 is never initialised before this adc, and the second
// operand is a literal 0 -- presumably a register holding zero
// (e.g. __zero_reg__) was meant; confirm.
"adc %B0, 0"
: "=&r" (result) // %0: 16-bit output (%A0 = low byte, %B0 = high byte), early-clobber
: "r" (op1), "r" (op2) // %1 = op1, %2 = op2, each in a register
);

return result;
}

/*
 * Minimal driver for fadd(): 10 + 250 = 260 does not fit into 8 bits, so the
 * call exercises the carry into the high byte of the 16-bit result.
 */
int main(void)
{

uint8_t op1 = 10;
uint8_t op2 = 250;

uint16_t result = 0;

result = fadd( op1, op2 );

return 0;
}

Nach meinem Verständnis der Tutorials zu inline asm, die ich durchgearbeitet habe, sollte das reichen, um die beiden 8bit Werte mit carry zu addieren und das Ergebnis in die Variable result zurück zu schreiben. Leider meldet mir der Compiler das:

rm -rf scratch.o scratch.elf dep/* scratch.hex scratch.eep scratch.lss scratch.map
Build succeeded with 0 Warnings...
avr-gcc.exe -mmcu=atmega168 -Wall -gdwarf-2 -std=gnu99 -DF_CPU=18432000UL -Os -fsigned-char -MD -MP -MT scratch.o -MF dep/scratch.o.d -c ../scratch.c
avr-gcc.exe -mmcu=atmega168 -Wl,-Map=scratch.map scratch.o -o scratch.elf
scratch.o(.text+0x2): In function `fadd':
../scratch.c:7: undefined reference to `r24'
make: *** [scratch.elf] Error 1
Build failed with 1 errors and 0 warnings...

Der asm Code (.S File) sieht so aus:

.file "scratch.c"
.arch atmega168
__SREG__ = 0x3f
__SP_H__ = 0x3e
__SP_L__ = 0x3d
__tmp_reg__ = 0
__zero_reg__ = 1
.global __do_copy_data
.global __do_clear_bss
.section .debug_abbrev,"",@progbits
.Ldebug_abbrev0:
.section .debug_info,"",@progbits
.Ldebug_info0:
.section .debug_line,"",@progbits
.Ldebug_line0:
.text
.Ltext0:
/* fadd as generated for the first C version: 4-byte stack frame, */
/* arguments and result are kept in the frame and reloaded around the asm. */
.global fadd
.type fadd, @function
fadd:
.LFB2:
.LM1:
/* prologue: frame size=4 */
/* Save Y (r29:r28) and set it to the stack pointer minus the 4-byte frame. */
push r28
push r29
in r28,__SP_L__
in r29,__SP_H__
sbiw r28,4
/* Write the new 16-bit SP with interrupts disabled; the saved SREG is */
/* written back between the two halves. */
in __tmp_reg__,__SREG__
cli
out __SP_H__,r29
out __SREG__,__tmp_reg__
out __SP_L__,r28
/* prologue end (size=10) */
/* Spill the incoming arguments: r24 -> Y+1 (op1), r22 -> Y+2 (op2). */
std Y+1,r24
std Y+2,r22
.LM2:
/* Reload the asm inputs: r25 = op1 (%1), r24 = op2 (%2). */
ldd r25,Y+1
ldd r24,Y+2
/* #APP */
/* Expanded inline asm template. The second lds operand is a register name */
/* where lds expects an address, and the add uses r24 for both operands. */
lds r24, r25
add r24, r24
adc r25, 0
/* #NOAPP */
/* Store r25:r24 as the local "result" (Y+3/Y+4) ... */
std Y+3,r24
std Y+4,r25
.LM3:
/* ... and reload it into r25:r24 before returning. */
ldd r24,Y+3
ldd r25,Y+4
/* epilogue: frame size=4 */
/* Release the frame and restore SP, again with interrupts disabled. */
adiw r28,4
in __tmp_reg__,__SREG__
cli
out __SP_H__,r29
out __SREG__,__tmp_reg__
out __SP_L__,r28
pop r29
pop r28
ret
/* epilogue end (size=9) */
/* function fadd size 33 (14) */
.LFE2:
.size fadd, .-fadd
/* main as generated for the first C version: locals op1, op2 and result */
/* live in a 4-byte frame addressed through Y. */
.global main
.type main, @function
main:
.LFB3:
.LM4:
/* prologue: frame size=4 */
/* Initialise Y and SP to __stack - 4, reserving the 4-byte frame. */
ldi r28,lo8(__stack - 4)
ldi r29,hi8(__stack - 4)
out __SP_H__,r29
out __SP_L__,r28
/* prologue end (size=4) */
.LM5:
/* op1 = 10 */
ldi r24,lo8(10)
std Y+1,r24
.LM6:
/* op2 = 250; lo8(-6) is the same byte value (0xFA) */
ldi r24,lo8(-6)
std Y+2,r24
.LM7:
/* result = 0 */
std Y+3,__zero_reg__
std Y+4,__zero_reg__
.LM8:
/* result = fadd(op1, op2): op2 is passed in r22, op1 in r24, */
/* and the 16-bit return value is taken from r25:r24. */
ldd r22,Y+2
ldd r24,Y+1
call fadd
std Y+3,r24
std Y+4,r25
.LM9:
/* return 0: r25:r24 = 0, then jump to exit */
ldi r24,lo8(0)
ldi r25,hi8(0)
/* epilogue: frame size=4 */
jmp exit
/* epilogue end (size=2) */
/* function main size 20 (14) */
.LFE3:
.size main, .-main
.Letext0:
.section .debug_line
.long .LELT0-.LSLT0

.LSLT0:
.word 2

.long .LELTP0-.LASLTP0

.LASLTP0:
.byte 0x1
.byte 0x1
.byte 0xf6
.byte 0xf5
.byte 0xa
.byte 0x0
.byte 0x1
.byte 0x1
.byte 0x1
.byte 0x1
.byte 0x0
.byte 0x0
.byte 0x0
.byte 0x1
.ascii ".."
.byte 0
.ascii "C:/Programme/WinAVR/bin/../lib/gcc/avr/3.4.3/../../../../avr"
.ascii "/include"
.byte 0
.byte 0x0
.string "stdint.h"
.uleb128 0x2
.uleb128 0x0
.uleb128 0x0
.string "scratch.c"
.uleb128 0x1
.uleb128 0x0
.uleb128 0x0
.byte 0x0
.LELTP0:
.byte 0x0
.uleb128 0x3
.byte 0x2
.word .LM1

.byte 0x4
.uleb128 0x2
.byte 0x17
.byte 0x0
.uleb128 0x3
.byte 0x2
.word .LM2

.byte 0x17
.byte 0x0
.uleb128 0x3
.byte 0x2
.word .LM3

.byte 0x1b
.byte 0x0
.uleb128 0x3
.byte 0x2
.word .LM4

.byte 0x32
.byte 0x0
.uleb128 0x3
.byte 0x2
.word .LM5

.byte 0x16
.byte 0x0
.uleb128 0x3
.byte 0x2
.word .LM6

.byte 0x15
.byte 0x0
.uleb128 0x3
.byte 0x2
.word .LM7

.byte 0x16
.byte 0x0
.uleb128 0x3
.byte 0x2
.word .LM8

.byte 0x16
.byte 0x0
.uleb128 0x3
.byte 0x2
.word .LM9

.byte 0x16
.byte 0x0
.uleb128 0x3
.byte 0x2
.word .Letext0

.byte 0x0
.uleb128 0x1
.byte 0x1
.LELT0:
.section .debug_info
.long 401

.word 2

.long .Ldebug_abbrev0

.byte 0x2
.uleb128 0x1
.long .Ldebug_line0

.word .Letext0

.word .Ltext0

.string "GNU C 3.4.3"
.byte 0x1
.string "../scratch.c"
.string "D:\\\\priv\\\\Tech\\\\atmel\\\\learning\\\\scratch \\\\default"
.uleb128 0x2
.string "signed char"
.byte 0x1
.byte 0x6
.uleb128 0x3
.string "uint8_t"
.byte 0x1
.byte 0x46
.long 126

.uleb128 0x2
.string "unsigned char"
.byte 0x1
.byte 0x8
.uleb128 0x2
.string "int"
.byte 0x2
.byte 0x5
.uleb128 0x3
.string "uint16_t"
.byte 0x1
.byte 0x6d
.long 166

.uleb128 0x2
.string "unsigned int"
.byte 0x2
.byte 0x7
.uleb128 0x2
.string "long int"
.byte 0x4
.byte 0x5
.uleb128 0x2
.string "long unsigned int"
.byte 0x4
.byte 0x7
.uleb128 0x2
.string "long long int"
.byte 0x8
.byte 0x5
.uleb128 0x2
.string "long long unsigned int"
.byte 0x8
.byte 0x7
.uleb128 0x4
.long 333

.byte 0x1
.string "fadd"
.byte 0x2
.byte 0x4
.byte 0x1
.long 150

.word .LFB2

.word .LFE2

.byte 0x6
.byte 0x6c
.byte 0x93
.uleb128 0x1
.byte 0x6d
.byte 0x93
.uleb128 0x1
.uleb128 0x5
.string "op1"
.byte 0x2
.byte 0x3
.long 111

.byte 0x2
.byte 0x91
.sleb128 1
.uleb128 0x5
.string "op2"
.byte 0x2
.byte 0x3
.long 111

.byte 0x2
.byte 0x91
.sleb128 2
.uleb128 0x6
.string "result"
.byte 0x2
.byte 0x5
.long 150

.byte 0x2
.byte 0x91
.sleb128 3
.byte 0x0
.uleb128 0x7
.byte 0x1
.string "main"
.byte 0x2
.byte 0x2c
.byte 0x1
.long 143

.word .LFB3

.word .LFE3

.byte 0x6
.byte 0x6c
.byte 0x93
.uleb128 0x1
.byte 0x6d
.byte 0x93
.uleb128 0x1
.uleb128 0x6
.string "op1"
.byte 0x2
.byte 0x2e
.long 111

.byte 0x2
.byte 0x91
.sleb128 1
.uleb128 0x6
.string "op2"
.byte 0x2
.byte 0x2f
.long 111

.byte 0x2
.byte 0x91
.sleb128 2
.uleb128 0x6
.string "result"
.byte 0x2
.byte 0x31
.long 150

.byte 0x2
.byte 0x91
.sleb128 3
.byte 0x0
.byte 0x0
.section .debug_abbrev
.uleb128 0x1
.uleb128 0x11
.byte 0x1
.uleb128 0x10
.uleb128 0x6
.uleb128 0x12
.uleb128 0x1
.uleb128 0x11
.uleb128 0x1
.uleb128 0x25
.uleb128 0x8
.uleb128 0x13
.uleb128 0xb
.uleb128 0x3
.uleb128 0x8
.uleb128 0x1b
.uleb128 0x8
.byte 0x0
.byte 0x0
.uleb128 0x2
.uleb128 0x24
.byte 0x0
.uleb128 0x3
.uleb128 0x8
.uleb128 0xb
.uleb128 0xb
.uleb128 0x3e
.uleb128 0xb
.byte 0x0
.byte 0x0
.uleb128 0x3
.uleb128 0x16
.byte 0x0
.uleb128 0x3
.uleb128 0x8
.uleb128 0x3a
.uleb128 0xb
.uleb128 0x3b
.uleb128 0xb
.uleb128 0x49
.uleb128 0x13
.byte 0x0
.byte 0x0
.uleb128 0x4
.uleb128 0x2e
.byte 0x1
.uleb128 0x1
.uleb128 0x13
.uleb128 0x3f
.uleb128 0xc
.uleb128 0x3
.uleb128 0x8
.uleb128 0x3a
.uleb128 0xb
.uleb128 0x3b
.uleb128 0xb
.uleb128 0x27
.uleb128 0xc
.uleb128 0x49
.uleb128 0x13
.uleb128 0x11
.uleb128 0x1
.uleb128 0x12
.uleb128 0x1
.uleb128 0x40
.uleb128 0xa
.byte 0x0
.byte 0x0
.uleb128 0x5
.uleb128 0x5
.byte 0x0
.uleb128 0x3
.uleb128 0x8
.uleb128 0x3a
.uleb128 0xb
.uleb128 0x3b
.uleb128 0xb
.uleb128 0x49
.uleb128 0x13
.uleb128 0x2
.uleb128 0xa
.byte 0x0
.byte 0x0
.uleb128 0x6
.uleb128 0x34
.byte 0x0
.uleb128 0x3
.uleb128 0x8
.uleb128 0x3a
.uleb128 0xb
.uleb128 0x3b
.uleb128 0xb
.uleb128 0x49
.uleb128 0x13
.uleb128 0x2
.uleb128 0xa
.byte 0x0
.byte 0x0
.uleb128 0x7
.uleb128 0x2e
.byte 0x1
.uleb128 0x3f
.uleb128 0xc
.uleb128 0x3
.uleb128 0x8
.uleb128 0x3a
.uleb128 0xb
.uleb128 0x3b
.uleb128 0xb
.uleb128 0x27
.uleb128 0xc
.uleb128 0x49
.uleb128 0x13
.uleb128 0x11
.uleb128 0x1
.uleb128 0x12
.uleb128 0x1
.uleb128 0x40
.uleb128 0xa
.byte 0x0
.byte 0x0
.byte 0x0
.section .debug_pubnames,"",@progbits
.long 32

.word 2

.long .Ldebug_info0

.long 405

.long 258

.string "fadd"
.long 333

.string "main"
.long 0

.section .debug_aranges,"",@progbits
.long 16

.word 2

.long .Ldebug_info0

.byte 0x2
.byte 0x0
.word .Ltext0

.word .Letext0-.Ltext0

.word 0

.word 0

/* File "../scratch.c": code 53 = 0x0035 ( 28), prologues 14, epilogues 11 */

Der erzeugte Code zu fadd sieht für meine Begriffe etwas befremdlich aus, da r24 zu sich selbst addiert wird. Hat da jemand eine Idee, wo der Fehler liegt?

SprinterSB

28.03.2006, 14:40

So ganz versteh ich dein Template nicht. Die Operanden legst du nach "r", greifst aber via lds ... %1 drauf zu --> r24 wird also als Symbol betrachtet, nicht als Register, und folglich nicht gefunden (Linker meckert).

Wieso greifst du ins RAM, die Werte stehen doch in Registern?

SprinterSB

28.03.2006, 14:53

So was?
Da lege ich result und op1 ins gleiche reg (vermeidet kopieren)

#include <avr/io.h>

/*
 * fadd - returns op1 + op2 as a 16-bit value using AVR inline assembly.
 *
 * op1 is tied to the result register through the matching constraint "0",
 * so the template only has to clear the high byte, add op2 to the low byte
 * and add the carry into the high byte.
 *
 * Operand numbering: %0 = result, %1 = op1 (output), %2 = op1 (input, same
 * register as %0), %3 = op2.
 *
 * NOTE(review): %0 is not marked early-clobber here, although "clr %B0"
 * writes to it before %3 is read. If op2 were allocated to the register of
 * %B0 it would be cleared before use -- confirm; the later variant in this
 * thread uses "=&r".
 */
uint16_t fadd (uint8_t op1, uint8_t op2)
{
uint16_t result;
// load op1 into the 16-bit register
// add op2 with carry and return the result

asm volatile (
"clr %B0" "\n\t" // high byte of result = 0
"add %A0, %3" "\n\t" // low byte (holds op1) += op2
"adc %B0, __zero_reg__" // high byte += carry
: "=r" (result), "=r" (op1)
: "0" (op1), "r" (op2)
);

return result;
}

// File-scope result: the generated code stores the sum here with sts.
// Presumably global so that the computation is not optimised away -- confirm.
uint16_t result;

/* Minimal driver: computes fadd(10, 250) and stores it in the global result. */
void main()
{
uint8_t op1 = 10;
uint8_t op2 = 250;

//uint16_t result = 0;

result = fadd( op1, op2 );
}

.file "random-leds.c"
.arch atmega8
__SREG__ = 0x3f
__SP_H__ = 0x3e
__SP_L__ = 0x3d
__tmp_reg__ = 0
__zero_reg__ = 1
.global __do_copy_data
.global __do_clear_bss
.text
; fadd as generated for the variant where result and op1 share a register:
; no frame and no register copies, only the three template instructions.
.global fadd
.type fadd, @function
fadd:
/* prologue: frame size=0 */
/* prologue end (size=0) */
/* #APP */
; r24 = op1 (and low byte of the result), r22 = op2, r25 = high byte
clr r25
add r24, r22
adc r25, __zero_reg__
/* #NOAPP */
/* epilogue: frame size=0 */
; the sum is returned in r25:r24
ret
/* epilogue end (size=1) */
/* function fadd size 7 (6) */
.size fadd, .-fadd
; main for the variant with a global result: the constants are loaded
; directly into the argument registers and the sum is stored to "result".
.global main
.type main, @function
main:
/* prologue: frame size=0 */
; initialise Y and SP to __stack
ldi r28,lo8(__stack - 0)
ldi r29,hi8(__stack - 0)
out __SP_H__,r29
out __SP_L__,r28
/* prologue end (size=4) */
; r22 = op2 = 250 (lo8(-6) is the same byte), r24 = op1 = 10
ldi r22,lo8(-6) ; 12 *movqi/2 [length = 1]
ldi r24,lo8(10) ; 13 *movqi/2 [length = 1]
rcall fadd ; 14 call_value_insn/3 [length = 1]
; result = r25:r24 (high byte stored first)
sts (result)+1,r25 ; 16 *movhi/3 [length = 4]
sts result,r24
/* epilogue: frame size=0 */
rjmp exit
/* epilogue end (size=1) */
/* function main size 12 (7) */
.size main, .-main
; Storage for the global "result": a 2-byte object in .bss, filled with 0.
.global result
.global result
.section .bss
.type result, @object
.size result, 2
result:
.skip 2,0

::EDIT::

Mit result in eigenem Register sieht's so aus:

/*
 * fadd - returns op1 + op2 as a 16-bit value using AVR inline assembly.
 *
 * Variant with result as an early-clobber output ("=&r"). op1 is still
 * supplied in the result register through the matching constraint "0".
 *
 * Operand numbering: %0 = result, %1 = op1 (same register as %0), %2 = op2.
 */
uint16_t fadd (uint8_t op1, uint8_t op2)
{
uint16_t result;
// load op1 into the 16-bit register
// add op2 with carry and return the result

asm volatile (
"clr %B0" "\n\t" // high byte of result = 0
"add %A0, %2" "\n\t" // low byte (holds op1) += op2
"adc %B0, __zero_reg__" // high byte += carry
: "=&r" (result)
: "0" (op1), "r" (op2)
);

return result;
}

; fadd as generated for the "=&r" variant: the result gets its own register
; pair r19:r18, which costs one copy in and one copy out.
fadd:
; copy op1 (r24) into the low byte of the result register
mov r18,r24 ; 30 *movqi/1 [length = 1]
/* #APP */
clr r19
add r18, r22
adc r19, __zero_reg__
/* #NOAPP */
; move the sum into the return registers r25:r24
movw r24,r18 ; 31 *movhi/1 [length = 1]
ret

ogni42

28.03.2006, 15:05

Super, vielen Dank. Habe vor lauter Bäumen den Wald nicht mehr gesehen und stand mir selbst im Weg :oops:

SprinterSB

28.03.2006, 15:21

BTW. Solch kurze Sequenzen fühlen sich als Inline-Funktion bzw. Makro wohler ;-)

/*
 * fadd(op1, op2) - macro form of the 8-bit + 8-bit -> 16-bit add, written as
 * a GCC statement expression whose value is __result.
 *
 * __result starts as op1 cast to uint16_t and is both input ("0") and output
 * of the asm, so only add/adc are emitted. The first template line writes an
 * assembler comment containing the stringified arguments into the output.
 *
 * NOTE(review): op1 is not parenthesised in "(uint16_t) op1", so the cast
 * binds only to the first operand of a compound argument such as a + b.
 */
#define fadd(op1,op2) \
({ \
uint16_t __result = (uint16_t) op1;\
\
asm volatile ( \
"; fadd(" #op1 ", " #op2 ")\n\t"\
"add %A0, %2" "\n\t" \
"adc %B0, __zero_reg__" \
: "=r" (__result) \
: "0" (__result), "r" (op2)\
); \
\
__result; \
})

; main as generated with the fadd macro: the add is expanded in place,
; there is no call.
.global main
.type main, @function
main:
/* prologue: frame size=0 */
; initialise Y and SP to __stack
ldi r28,lo8(__stack - 0)
ldi r29,hi8(__stack - 0)
out __SP_H__,r29
out __SP_L__,r28
/* prologue end (size=4) */
; r18 = op2 = 250 (lo8(-6) is the same byte); r25:r24 = __result = 10
ldi r18,lo8(-6) ; 10 *movqi/2 [length = 1]
ldi r24,lo8(10) ; 14 *movhi/4 [length = 2]
ldi r25,hi8(10)
/* #APP */
; fadd(op1, op2)
add r24, r18
adc r25, __zero_reg__
/* #NOAPP */
; result = r25:r24
sts (result)+1,r25 ; 17 *movhi/3 [length = 4]
sts result,r24
/* epilogue: frame size=0 */
rjmp exit

ogni42

28.03.2006, 15:37

Ich hatte auch vor, die Funktionen nachher - wenn alles funktioniert - inline zu machen. Hintergrund ist, dass ich eine Fixpunkt-Funktionssammlung erstellen möchte (habe leider nichts Adäquates im Netz gefunden), die die FMULS-Befehle verwendet, was m.E. zu recht schnellem Code bei Signalverarbeitungsaufgaben führen sollte.

Eigentlich reichen da 4 Funktionen:
fmul - Multiplikation zweier Fixpunktzahlen
fadd - Addition, mit Über-/Unterlaufbegrenzung
fsub - Subtraktion, mit Über-/Unterlaufbegrenzung
fmac - MultiplyAccumulate (3 Operanden) mit Über-/Unterlaufbegrenzung

Georg-Johann, weisst Du ob es sowas schon gibt? Man muss das Rad ja nicht neu erfinden.

SprinterSB

28.03.2006, 15:51

Eigentlich sollte das schon gut auf C-Ebene gehen (wenn auch nicht so Effizient wie per asm).

z.B. eine fix-Mul, die 2 Werte (interpretiert als unsigned mit 8 Bit Vor- und Nachkomma) multipliziert:

/*
 * fmul - multiplies a and b as a 32-bit product and returns the upper
 * 16 bits of that product.
 *
 * NOTE(review): the surrounding text describes the operands as 8.8
 * fixed-point values; the author notes later in the thread that the shift
 * for that format should be 8, not 16.
 */
/*static inline*/ uint16_t fmul (uint16_t a, uint16_t b)
{
return (uint16_t) ((uint32_t) a*b >> 16);
}

; fmul as generated by avr-gcc: both 16-bit arguments are zero-extended to
; 32 bits, multiplied by the library routine __mulsi3, and the upper word of
; the product is returned in r25:r24.
.global fmul
.type fmul, @function
fmul:
/* prologue: frame size=0 */
/* prologue end (size=0) */
; r19:r18 = incoming r23:r22 (second argument)
movw r18,r22 ; 4 *movhi/1 [length = 1]
; r27:r26 = 0, used below as the upper word of the first operand
clr r26 ; 12 zero_extendhisi2/1 [length = 2]
clr r27
; r21:r20 = 0, the upper word of the second operand (r21:r18)
clr r20 ; 13 zero_extendhisi2/1 [length = 2]
clr r21
; first operand r25:r22 = 0:0:incoming r25:r24 (first argument)
movw r22,r24 ; 14 *movsi/1 [length = 2]
movw r24,r26
rcall __mulsi3 ; 16 *mulsi3_call [length = 1]
; copy the 32-bit product from r25:r22 to r27:r24
movw r26,r24 ; 17 *movsi/1 [length = 2]
movw r24,r22
; logical shift right by 16: r25:r24 = upper word, r27:r26 = 0
movw r24,r26 ; 35 *lshrsi3_const/2 [length = 3]
clr r26
clr r27
/* epilogue: frame size=0 */
ret
/* epilogue end (size=1) */
/* function fmul size 14 (13) */
.size fmul, .-fmul

; Disassembly of __mulsi3, the 32-bit multiply routine called by fmul.
; Operands are read from r25:r22 and r21:r18. The product is accumulated in
; r31:r30:r27:r26 from 8x8-bit mul results (each delivered in r1:r0) and is
; moved to r25:r22 at the end. r1 is cleared again before returning.
00000094 <__mulsi3>:
94: 62 9f mul r22, r18
96: d0 01 movw r26, r0
98: 73 9f mul r23, r19
9a: f0 01 movw r30, r0
9c: 82 9f mul r24, r18
9e: e0 0d add r30, r0
a0: f1 1d adc r31, r1
a2: 64 9f mul r22, r20
a4: e0 0d add r30, r0
a6: f1 1d adc r31, r1
a8: 92 9f mul r25, r18
aa: f0 0d add r31, r0
ac: 83 9f mul r24, r19
ae: f0 0d add r31, r0
b0: 74 9f mul r23, r20
b2: f0 0d add r31, r0
b4: 65 9f mul r22, r21
b6: f0 0d add r31, r0
; r25 = 0, used below to propagate carries with adc
b8: 99 27 eor r25, r25
ba: 72 9f mul r23, r18
bc: b0 0d add r27, r0
be: e1 1d adc r30, r1
c0: f9 1f adc r31, r25
c2: 63 9f mul r22, r19
c4: b0 0d add r27, r0
c6: e1 1d adc r30, r1
c8: f9 1f adc r31, r25
; result -> r25:r22
ca: bd 01 movw r22, r26
cc: cf 01 movw r24, r30
; restore r1 = 0 (__zero_reg__) after the mul instructions
ce: 11 24 eor r1, r1
d0: 08 95 ret

Das ist schon mal deutlich effizienter als float (aber natürlich auch was anderes ;-))

Zum weiteren Optimieren kann man ja von dem Code ausgehen und weitere Kenntnisse ausnutzen (z.B. welche Register in mulsi3 anfangs 0 sind; das spart das Löschen, und man kann z.B. stattdessen __zero_reg__ draufaddieren).

ogni42

28.03.2006, 17:40

Ja, die Idee dahinter ist eben, gerade die vier wesentlichen Fixpunkt-Operationen (DIV kommt bei Signalverarbeitung eher selten vor) möglichst effizient zu implementieren.

Mal schauen, vielleicht bekomme ich es ja hin. Dann werde ich mal ein paar Benchmarks posten.

SprinterSB

29.03.2006, 09:07

Mit DIV ist's genauso einfach auf long bzw. unsigned long abzubilden. Man muss sich einfach nur überlegen, wo das Komma steht, und wieviel man es im Ergebnis bzw. in der Eingabe verschieben muss.

Bei MUL von 8.8 * 8.8 bekommt man ein 16.16, muss dann also 8 nach rechts schieben (und nicht 16, wie ich oben falsch geschrieben hatte :oops:)

ogni42

29.03.2006, 09:43

So wie ich es bisher kennengelernt habe, ist die Idee hinter FP (in der Signalverarbeitung), alle Zahlen als Zahlen im Bereich [-1, 1[ zu interpretieren (so wie es der FMULS-Assembler-Befehl ebenfalls annimmt). Dann ist das Ergebnis einer Multiplikation immer im richtigen Wertebereich (ausser bei -1, aber das kann man abfangen).

Bei MAC (MultiplyAccumulate) muss dann nur noch bei der Addition auf Bereichsüberläufe getestet werden und man spart in jedem Fall das Normalisieren der Zahlen.

Im Moment werde ich es so implementieren (zumindest versuchen, so zu implementieren :) ), dass Überläufe geclippt werden, sprich: im Fall eines Überlaufs werden die Grenzwerte des Wertebereichs eingesetzt.

Es soll drei Sätze von Funktionen geben:
(i) 8bit -> 8bit
(ii) 8bit -> 16bit
(iii) 16bit -> 16bit

mit jeweils: fpadd, fpsub, fpmul und fpmac

ogni42

29.03.2006, 12:49

Hier mal ein schöner Vergleich:

/*
 * fmac16 - fixed-point multiply-accumulate with saturation (AVR inline asm).
 *
 * Multiplies the high bytes of op1 and op2 with fmuls, adds the 16-bit op3
 * to the product in r1:r0 and returns the sum. If the addition sets the
 * overflow flag, the result is replaced by 0x7fff, or by 0x8000 when the
 * carry flag is set as well.
 *
 * NOTE(review): the asm leaves r1 modified. As pointed out in the replies
 * in this thread, avr-gcc relies on r1 (__zero_reg__) holding 0 and listing
 * it as a clobber is not sufficient; it has to be cleared again
 * (clr __zero_reg__) before the asm statement ends.
 */
uint16_t fmac16(uint16_t op1, uint16_t op2, uint16_t op3)
{
uint16_t result;

// Range check to prevent overflow: it is necessary that at least one
// operand's high byte is != 0x80.
// NOTE(review): op1 is tested against 0x8000/0x8100, but op2 against
// 0x80/0x81 -- confirm whether 0x8000/0x8100 was meant for op2 as well.
op1 = (op1 != 0x8000)
? (op2 = (op2 != 0x80) ? op2 : 0x81), op1
: 0x8100;

asm volatile (
"fmuls %B1, %B2" "\n\t" // take high byte of op1 and op2 only
"add r0, %A3" "\n\t" // do a 16 bit add
"adc r1, %B3" "\n\t"
"movw %0, r0" "\n\t"
"brvc 0f" "\n\t" // check for overflow (pos/neg) @see fadd8
"ldi %A0, 0xff" "\n\t" // overflow: preset the result to 0x7fff
"ldi %B0, 0x7f" "\n\t"
"brcc 0f" "\n\t" // keep 0x7fff unless carry is set
"ldi %A0, 0x00" "\n\t" // overflow with carry: result = 0x8000
"ldi %B0, 0x80" "\n\t"
"0:" "\n\t"
: "=&a" (result) // %0: early-clobber output
: "a" (op1), "a" (op2), "a" (op3) // %1, %2, %3
: "r0", "r1"
);

return result;
}

Für die eigentliche MAC-Operation werden hier nur 5 Takte benötigt (fmuls, add, adc und movw).

Eine direkte Implementierung in C

// Plain-C counterpart used for the comparison: a 16-bit multiply-add on
// three uint16_t operands, without any overflow handling.
uint16_t op161;
uint16_t op162;
uint16_t op163;
uint16_t result16;

op161 = 0x8000;
op162 = 0x8000;
op163 = 0x7000;

result16 = op161*op162+op163; // <- this line

Ist als Assembler Code schon erheblich länger:

; Code generated for the C multiply-add: the low 16 bits of
; (r19:r18) * (r25:r24) are built in r21:r20 from three 8x8 partial products.
mul r18,r24
movw r20,r0
mul r18,r25
add r21,r0
mul r19,r24
add r21,r0
; mul left its result in r1:r0 -- set r1 back to 0
clr r1
movw r18,r20
; load the addend from the frame (Y+9/Y+10; presumably op163) and add it
ldd r24,Y+9
ldd r25,Y+10
add r24,r18
adc r25,r19

und benötigt 17 Takte, wobei die +/- Überlaufprüfung nochmal erheblich komplexer wäre.

SprinterSB

29.03.2006, 14:40

Ein Fehler ist noch drinne: R1 muss nach einer Funktion/Assembler-Schnippsel unverändert vorliegen, denn avr-gcc hält immer 0 in diesem Register!

Wenn du es also veränderst, musst du die 0 wieder herstellen, sonst bekommst du an ganz anderer Stelle einen Fehler.

ogni42

29.03.2006, 15:08

Super, danke für den Hinweis. Bevor ich mich jetzt wieder durchs .S File quäle: Reicht es nicht, r1 in der Clobber Liste anzugeben?

SprinterSB

29.03.2006, 15:11

Nein, das genügt nicht. Sonst hätt ich nix dazu geschrieben ;-)
Aus den Clobbers kann es also raus. R0 ebenfalls, da wird eh von ausgegangen, daß das nichts überlebt.

ogni42

29.03.2006, 15:13

OK, also push r1, pop r1 und gut ist.

ogni42

29.03.2006, 15:14

Was ist dann der Effekt, wenn ich ein Register (z.B. r1) in der Clobber Liste angebe?

SprinterSB

29.03.2006, 15:24

AFAIK sind R0 und R1 fixed regs, werden also nicht vom Compiler verwendet.
Um genau zu sein: Der Compiler reloadet z.B. keine Variablen in diese Register oder verwendet sie für Funktionsargumente oder zum temporären Speichern. Dennoch werden sie im avr-gcc-Backend verwendet, etwa wenn ein int gegen eine Konstante kleiner als 256 verglichen wird (Das Highbyte wird dann mit cpc __zero_reg__ verglichen).
Der Compiler selbst hat aber auf interner Ebene keine "Vorstellung" von diesen Registern, sondern diese Verwendungen als temp- bzw. 0-Register geschehen implizit durch die Backend-Implementierung.

Alles klar? Wahrscheinlich noch verwirrter als vorher...

Register > 1 in der Clobber-Liste haben natürlich einen Effekt. Mach mal eine Funktion und clobbere r2 und schau den Code an.

ogni42

29.03.2006, 15:27

Danke, jetzt ist es klar. Habe r1 am Anfang der mul auf den Stack gelegt und nachher wieder restauriert. Sind halt zwei Takte mehr.

SprinterSB

29.03.2006, 15:32

Ein clr braucht einen Takt, während push/pop (Speicherzugriff!) 4 Takte braucht. Zudem braucht es 1 Byte vom Stack und das doppelte an Flash...

Einfach
clr r1
oder
clr __zero_reg__

ogni42

29.03.2006, 16:14

Ah bestens! Danke.

SprinterSB

30.03.2006, 13:10

...Ist als Assembler Code schon erheblich länger...
Hat es einen besonderen Grund, daß du avr-gcc deinen C-Code nicht optimieren lässt (zumindest sieht der erzeugte asm stark danach aus)?

ogni42

30.03.2006, 13:32

Doch, das lässt er natürlich :) Habe den C code mal in eine eigene Funktion gepackt:

/*
 * test - plain-C reference for the multiply-accumulate: returns
 * op1 * op2 + op3 as a uint16_t, without any overflow handling.
 */
uint16_t test( uint16_t op1, uint16_t op2, uint16_t op3)
{
uint16_t result;

result = op1*op2+op3; // <- this line

return result;
}

Da kommt bei mir mit -O2 folgender Assemblercode raus:

/* test() as generated with -O2: 16x16 -> 16-bit multiply from three 8x8 */
/* partial products, followed by a 16-bit add. */
test:
.LFB3:
.LM7:
/* prologue: frame size=0 */
/* prologue end (size=0) */
/* move the second factor from r23:r22 to r19:r18; r23:r22 is reused below */
movw r18,r22
.LM8:
/* r23:r22 = low 16 bits of (r25:r24) * (r19:r18) */
mul r24,r18
movw r22,r0
mul r24,r19
add r23,r0
mul r25,r18
add r23,r0
/* mul left its result in r1:r0 -- set r1 back to 0 */
clr r1
movw r24,r22
.LM9:
/* add the third operand (r21:r20); the sum is returned in r25:r24 */
add r24,r20
adc r25,r21
/* epilogue: frame size=0 */
ret
/* epilogue end (size=1) */

Ist also immer noch erheblich langsamer als die Fixpunktimplementierung. Kommt hinzu, dass, wie bereits gesagt, das Clipping in C nicht so einfach zu realisieren ist.