Z88dk unusual behaviour

If you like transforming your statements into code, this is the place for you

Moderator: Programming Moderators

seedy1812
Posts: 91
Joined: Tue May 30, 2017 11:31 am

Z88dk unusual behaviour

Postby seedy1812 » Sun Apr 12, 2020 11:52 am

I have been converting a map routine, which I wrote for the GBA many years ago, to the Next, but I am seeing weird behaviour from the compiler.

My command line is

Code: Select all

zcc +zxn -vn -m test.c -o test.asm -subtype=nex -S --c-code-in-asm -SO3 -O3 --opt-code-speed
and test.c is

Code: Select all


typedef unsigned int           uint16_t;

typedef struct	/* a position: one pair of uint16_t coordinates */
{
	uint16_t	x,y;	/* both members are uint16_t, x first then y */
}my_Pos;

void		MAP_SubRegion(uint16_t x1, uint16_t y1,uint16_t x2,uint16_t y2);


my_Pos  			pos[4];
void test1()	/* case 1: all operands are read from the plain (non-volatile) global array pos[] */
{
	MAP_SubRegion(pos[0].x,pos[0].y,pos[1].x,pos[1].y);	/* unconditional call */
	if((pos[1].x < pos[2].x)||(pos[3].x < pos[0].x)||(pos[1].y < pos[2].y)||(pos[3].y < pos[0].y))	/* true if any of four unsigned comparisons holds; || short-circuits left to right */
	{
		MAP_SubRegion(pos[0].x,pos[0].y,pos[1].x,pos[1].y);	/* same four arguments as the first call */
	}
}

volatile my_Pos  	v_pos[4];	/* volatile-qualified: each read written in the source must be performed as a real access */
void test2()	/* case 2: same statements as test1, but operands come from the volatile global v_pos[] */
{
	MAP_SubRegion(v_pos[0].x,v_pos[0].y,v_pos[1].x,v_pos[1].y);	/* unconditional call */

	if((v_pos[1].x < v_pos[2].x)||(v_pos[3].x < v_pos[0].x)||(v_pos[1].y < v_pos[2].y)||(v_pos[3].y < v_pos[0].y))	/* true if any of four unsigned comparisons holds; || short-circuits */
	{
		MAP_SubRegion(v_pos[0].x,v_pos[0].y,v_pos[1].x,v_pos[1].y);	/* same four arguments as the first call */
	}
}
void test3()	/* case 3: same statements as test1, but operands come from a local (automatic) array */
{
	my_Pos  	l_pos[4];	/* NOTE(review): never initialised, so every read below is of an indeterminate value - presumably acceptable because this exists only to inspect generated code; confirm before running it */
	MAP_SubRegion(l_pos[0].x,l_pos[0].y,l_pos[1].x,l_pos[1].y);	/* unconditional call */
	if((l_pos [1].x < l_pos [2].x)||(l_pos [3].x < l_pos [0].x)||(l_pos [1].y < l_pos [2].y)||(l_pos [3].y < l_pos [0].y))	/* true if any of four unsigned comparisons holds; || short-circuits */
	{
		MAP_SubRegion(l_pos [0].x,l_pos [0].y,l_pos [1].x,l_pos [1].y);	/* same four arguments as the first call */
	}
}
The routines are identical except for where the pos array is located.
test1 = is a global array
test2 = the global array is volatile
test3 = the array is on the stack

test2 differs from test1 because the array is volatile: the compiler will not keep temporary copies of the array's values on the stack, as they can change at any time.

In test3 the 16-bit values of the array are on the stack, and there is no quick way to get such a value into a 16-bit register, so two reads have to be done

Code: Select all

;test.c:38: MAP_SubRegion(l_pos [0].x,l_pos [0].y,l_pos [1].x,l_pos [1].y);
	ld	l,(ix-6)	; 19 clock cycles
	ld	h,(ix-5)	; 19 clock cycles
compared to

Code: Select all

l_test2_00101:
;test.c:29: MAP_SubRegion(v_pos[0].x,v_pos[0].y,v_pos[1].x,v_pos[1].y);
	ld	de,(_v_pos + 0x0006) ; 20 clock cycles
	
test1 allows the creation of temporary variables which are then stored on the stack - this may be a copy of a variable or the result of an operation ( a = b -c ). In this instance the comparison creates this code

Code: Select all

;test.c:16: if((pos[1].x < pos[2].x)||(pos[3].x < pos[0].x)||(pos[1].y < pos[2].y)||(pos[3].y < pos[0].y))
	ld	hl,(_pos + 0x0004)
	ld	(ix-6),l
	ld	(ix-5),h
	ld	hl,(_pos + 8)
	ld	(ix-4),l
	ld	(ix-3),h
	ld	hl,(_pos + 0x0006)
	ld	(ix-2),l
	ld	(ix-1),h
	ld	bc,(_pos + 0x0002)
	ld	de,(_pos)
	ld	a,(ix-6)
	sub	a,(ix-4)
	ld	a,(ix-5)
	sbc	a,(ix-3)
	jr	C,l_test1_00101
	ld	hl, (_pos + 12)
	xor	a, a
	sbc	hl, de
	jr	C,l_test1_00101
	ld	hl, (_pos + 10)
	ld	a,(ix-2)
	sub	a, l
	ld	a,(ix-1)
	sbc	a, h
	jr	C,l_test1_00101
	ld	hl, (_pos + 14)
	xor	a, a
	sbc	hl, bc
	jr	NC,l_test1_00106

Here the code copies three 16-bit values onto the stack as temporaries, which it then decides to use instead of the global values.

Code: Select all

;test.c:18: MAP_SubRegion(pos[0].x,pos[0].y,pos[1].x,pos[1].y);
	ld	l,(ix-2)
	ld	h,(ix-1)
	push	hl
	ld	l,(ix-6)
	ld	h,(ix-5)
	push	hl
	push	bc
	push	de
	call	_MAP_SubRegion
	ld	hl,8
	add	hl, sp
	ld	sp, hl
	
When temporary variables are not an issue the compiler still produces code which is a bit quirky

Code: Select all

;test.c:15: MAP_SubRegion(pos[0].x,pos[0].y,pos[1].x,pos[1].y);
	ld	de,(_pos + 0x0006)
	ld	hl,(_pos + 0x0004)
	ld	(ix-2),l
	ld	(ix-1),h
	ld	bc,(_pos + 0x0002)
	ld	hl,(_pos)
	push	de
	ld	e,(ix-2)
	ld	d,(ix-1)
	push	de
	push	bc
	push	hl
	call	_MAP_SubRegion
	ld	hl,8
	add	hl, sp
	ld	sp, hl
Why is it storing variables in the stack frame, adding 80 cycles, when it could be

Code: Select all

;test.c:15: MAP_SubRegion(pos[0].x,pos[0].y,pos[1].x,pos[1].y);
	ld	de,(_pos + 0x0006)
	ld	hl,(_pos + 0x0004)
	ld	bc,(_pos + 0x0002)
	push	de
	push	hl
	ld	hl,(_pos)
	push	bc
	push	hl
	call	_MAP_SubRegion
	ld	hl,8
	add	hl, sp
	ld	sp, hl
The code this comes from is a 272-line .c file and an 80-line header, which does not include any other headers. If I make the global pos array volatile it takes 14 seconds to compile; otherwise it takes 360 seconds!

seedy1812
Posts: 91
Joined: Tue May 30, 2017 11:31 am

Re: Z88dk unusual behaviour

Postby seedy1812 » Sun Apr 12, 2020 5:31 pm

Doing a bit of research on function calling, it appears that if you pass more than 3 values it produces slow code

Code: Select all

typedef unsigned int           uint16_t;

void		MAP_SubRegion3(uint16_t x1, uint16_t y1,uint16_t x2);
void		MAP_SubRegion4(uint16_t x1, uint16_t y1,uint16_t x2,uint16_t y2);
void		MAP_SubRegion5(uint16_t x1, uint16_t y1,uint16_t x2,uint16_t y2,uint16_t x3);
void		MAP_SubRegion6(uint16_t x1, uint16_t y1,uint16_t x2,uint16_t y2,uint16_t x3,uint16_t y3);
void		MAP_SubRegion7(uint16_t x1, uint16_t y1,uint16_t x2,uint16_t y2,uint16_t x3,uint16_t y3,uint16_t x4);


uint16_t  			ipos[10];
void test0_3()	/* calls the 3-argument variant with ipos[0]..ipos[2] */
{
	MAP_SubRegion3(ipos[0],ipos[1],ipos[2]);
}

void test0_4()	/* calls the 4-argument variant with ipos[0]..ipos[3] */
{
	MAP_SubRegion4(ipos[0],ipos[1],ipos[2],ipos[3]);
}

void test0_5()	/* calls the 5-argument variant with ipos[0]..ipos[4] */
{
	MAP_SubRegion5(ipos[0],ipos[1],ipos[2],ipos[3],ipos[4]);
}

void test0_6()	/* calls the 6-argument variant with ipos[0]..ipos[5] */
{
	MAP_SubRegion6(ipos[0],ipos[1],ipos[2],ipos[3],ipos[4],ipos[5]);
}

void test0_7()	/* calls the 7-argument variant with ipos[0]..ipos[6] */
{
	MAP_SubRegion7(ipos[0],ipos[1],ipos[2],ipos[3],ipos[4],ipos[5],ipos[6]);
}

Code: Select all

_test0_7:
	push	ix	; save the caller's IX
	ld	ix,0
	add	ix,sp	; IX = SP: frame pointer for this function
	ld	hl, -8
	add	hl, sp
	ld	sp, hl	; reserve 8 bytes: four 16-bit temporaries at ix-8 .. ix-1
;test.c:34: MAP_SubRegion7(ipos[0],ipos[1],ipos[2],ipos[3],ipos[4],ipos[5],ipos[6]);
	ld	de, (_ipos + 12)	; DE = ipos[6] (2-byte elements, so byte offset 12)
	ld	hl,(_ipos + 10)	; ipos[5] -> temporary at ix-8/ix-7
	ld	(ix-8),l
	ld	(ix-7),h
	ld	hl,(_ipos + 8)	; ipos[4] -> temporary at ix-6/ix-5
	ld	(ix-6),l
	ld	(ix-5),h
	ld	hl,(_ipos + 6)	; ipos[3] -> temporary at ix-4/ix-3
	ld	(ix-4),l
	ld	(ix-3),h
	ld	hl,(_ipos + 4)	; ipos[2] -> temporary at ix-2/ix-1
	ld	(ix-2),l
	ld	(ix-1),h
	ld	bc,(_ipos + 2)	; BC = ipos[1]
	ld	hl,(_ipos)	; HL = ipos[0]
	push	de	; arguments are pushed last to first, starting with ipos[6]
	ld	e,(ix-8)	; reload ipos[5] from its temporary
	ld	d,(ix-7)
	push	de
	ld	e,(ix-6)	; reload ipos[4]
	ld	d,(ix-5)
	push	de
	ld	e,(ix-4)	; reload ipos[3]
	ld	d,(ix-3)
	push	de
	ld	e,(ix-2)	; reload ipos[2]
	ld	d,(ix-1)
	push	de
	push	bc	; ipos[1], still held in BC
	push	hl	; ipos[0], still held in HL
	call	_MAP_SubRegion7
	ld	hl,14
	add	hl, sp
	ld	sp, hl	; remove the 7 x 2 bytes of arguments
;test.c:35: }
	ld	sp, ix	; release the temporaries
	pop	ix	; restore the caller's IX
	ret
It appears that it calculates all the values and, from the 4th value onwards, saves them in the stack frame (ix-nn) and then reads them back before pushing them onto the stack. This gives each such parameter a 4*19 T-state overhead.

seedy1812
Posts: 91
Joined: Tue May 30, 2017 11:31 am

Re: Z88dk unusual behaviour

Postby seedy1812 » Mon Apr 13, 2020 11:05 am

Code: Select all

typedef unsigned int           uint16_t;

void		MAP_SubRegion1(uint16_t x1);
uint16_t  			ipos[10];

void text_0_sub()	/* passes one computed argument instead of plain array reads */
{
	MAP_SubRegion1(ipos[4]-ipos[5]);	/* difference of two uint16_t elements, passed as the single argument */
}
This produces

Code: Select all

;	---------------------------------
; Function text_0_sub
; ---------------------------------
_text_0_sub:
	ld	bc, (_ipos + 8)	; BC = ipos[4] (2-byte elements, so byte offset 8)
	ld	hl, (_ipos + 10)	; HL = ipos[5]
	ld	a, c
	sub	a, l
	ld	c, a	; C = low byte of ipos[4] - ipos[5]
	ld	a, b
	sbc	a, h	; high byte, including the borrow from the low byte
	ld	b, a	; BC now holds the 16-bit difference
	push	bc	; the single argument
	call	_MAP_SubRegion1
	pop	af	; discard the argument; the popped value is not used
	ret
Surely a 16-bit sbc would be quicker and smaller. The 6 instructions take 24 cycles, whereas xor a + sbc hl,bc takes 4+15 = 19 cycles.

seedy1812
Posts: 91
Joined: Tue May 30, 2017 11:31 am

Re: Z88dk unusual behaviour

Postby seedy1812 » Mon Apr 13, 2020 11:23 am

Code: Select all

void test0_c1()	/* two 3-argument calls separated by a two-term unsigned comparison */
{
	MAP_SubRegion3(ipos[3],ipos[1],ipos[2]);	/* note: first argument here is ipos[3] */
	if((ipos[1] < ipos[2])||(ipos[3] < ipos[0]))	/* true if either unsigned comparison holds; || short-circuits */
	{
		MAP_SubRegion3(ipos[0],ipos[1],ipos[2]);	/* first argument is ipos[0] this time; the other two match the first call */
	}
}
This innocent-looking code has two missed optimisations and one compilation flaw.

Code: Select all

;	---------------------------------
; Function test0_c1
; ---------------------------------
_test0_c1:
	push	ix	; save the caller's IX
	ld	ix,0
	add	ix,sp	; IX = SP: frame pointer for this function
	push	af	; moves SP down 2 bytes, creating one 16-bit slot at ix-2/ix-1; the value pushed is overwritten below
	ld	de,(_ipos + 0x0004)	; DE = ipos[2]
	ld	bc,(_ipos + 0x0002)	; BC = ipos[1]
	ld	hl,(_ipos + 0x0006)	; HL = ipos[3]
	push	de	; arguments pushed last to first: ipos[2]
	push	bc	; ipos[1]
	push	hl	; ipos[3]
	call	_MAP_SubRegion3
	ld	hl,6
	add	hl, sp
	ld	sp, hl	; remove the 3 x 2 bytes of arguments; SP is back at ix-2
	ld	bc,(_ipos + 0x0002)	; BC = ipos[1]
	ld	hl,(_ipos + 0x0004)	; ipos[2] -> slot at ix-2/ix-1
	ld	(ix-2),l
	ld	(ix-1),h
	ld	de,(_ipos)	; DE = ipos[0]
	ld	a,c
	sub	a,(ix-2)
	ld	a, b
	sbc	a,(ix-1)	; carry is set when ipos[1] < ipos[2] (unsigned)
	jr	C,l_test0_c1_00101	; first condition true: go make the second call
	ld	hl,(_ipos + 0x0006)	; HL = ipos[3]
	xor	a, a	; clears the carry flag ahead of the 16-bit sbc
	sbc	hl, de	; carry is set when ipos[3] < ipos[0] (unsigned)
	jr	NC,l_test0_c1_00104	; neither condition true: skip the second call
l_test0_c1_00101:
	pop	hl	; SP is at ix-2 here, so this loads the ipos[2] value stored in the slot
	push	hl	; put the slot back so SP returns to ix-2
	push	hl	; third argument: ipos[2]
	push	bc	; second argument: ipos[1]
	push	de	; first argument: ipos[0]
	call	_MAP_SubRegion3
	ld	hl,6
	add	hl, sp
	ld	sp, hl	; remove the 3 x 2 bytes of arguments
l_test0_c1_00104:
	ld	sp, ix	; SP = frame pointer, which also discards the 2-byte slot
	pop	ix	; restore the caller's IX
	ret
The first being

Code: Select all

	ld	bc,(_ipos + 0x0002)
	ld	hl,(_ipos + 0x0004)
	ld	(ix-2),l
	ld	(ix-1),h
	ld	de,(_ipos)
	ld	a,c
	sub	a,(ix-2)
	ld	a, b
	sbc	a,(ix-1)
	jr	C,l_test0_c1_00101
As ld de,(address) does not affect the flags, it can be moved after the subtraction, and then you do not need to write to and read from a temporary variable - which will save you 4*19 T-states. And, as in the previous message, you can do a 16-bit subtraction, saving another 6 cycles

Code: Select all

	ld	hl,(_ipos + 0x0002)
	ld	bc,(_ipos + 0x0004)
	xor a
	sbc hl,bc
	ld	de,(_ipos)
	jr	C,l_test0_c1_00101
This code is a bit clever

Code: Select all

l_test0_c1_00101:
	pop	hl
	push	hl
	push	hl
	push	bc
	push	de
	call	_MAP_SubRegion3
	ld	hl,6
	add	hl, sp
	ld	sp, hl
The pop hl / push hl / push hl duplicates the last value on the stack ready for the call but none of them are needed.

Code: Select all

	push	bc
	push	de
	call	_MAP_SubRegion3
	ld	hl,4
	add	hl, sp
	ld	sp, hl
This would be a nice speed-up, but the code generation is flawed. The routine has 9 pushes against 2 pops, plus stack adjustments adding 12 bytes to the stack pointer (i.e. the equivalent of 6 pops).

User avatar
varmfskii
Posts: 287
Joined: Fri Jun 23, 2017 1:13 pm
Location: Stone Mountain, GA USA

Re: Z88dk unusual behaviour

Postby varmfskii » Mon Apr 13, 2020 4:00 pm

I fail to see the code generation issue. The stack pointer is restored to the calling state by the following ld sp,ix;pop ix at the end of the function.
Backer #2741 - TS2068, Byte, ZX Evolution

seedy1812
Posts: 91
Joined: Tue May 30, 2017 11:31 am

Re: Z88dk unusual behaviour

Postby seedy1812 » Mon Apr 13, 2020 4:58 pm

You are right , I had missed the ld sp,ix :)

Alcoholics Anonymous
Posts: 777
Joined: Mon May 29, 2017 7:00 pm

Re: Z88dk unusual behaviour

Postby Alcoholics Anonymous » Tue Apr 14, 2020 6:42 am

I posted an answer on the z88dk forum:
https://github.com/z88dk/z88dk/issues/1439

To summarize, the calling issue with more than three parameters is a long-standing annoyance that comes about because sdcc wants to gather *all* parameters before generating a stack frame. Unfortunately it's very difficult to fix up after code generation, and it should be fixed in the code generator instead. The "fix" is to change the function to use global statics to pass some parameters instead of via the stack.


Who is online

Users browsing this forum: No registered users and 2 guests