Forums

Sega Master System / Mark III / Game Gear
SG-1000 / SC-3000 / SF-7000 / OMV
Home - Forums - Games - Scans - Maps - Cheats - Credits
Music - Videos - Development - Hacks - Translations - Homebrew

View topic - Another Z88DK-SDCC difference

Reply to topic
Author Message
  • Joined: 28 Jan 2017
  • Posts: 549
  • Location: Málaga, Spain
Reply with quote
Another Z88DK-SDCC difference
Post Posted: Sat Jun 10, 2017 2:58 pm
Lets see... it funcs different in SDCC and Z88DK (with SO2). In SDCC works as expected. It seems in Z88DK the param for the first UNSAFE_SMS_VRAMmemcpy64 are not well computed. Maybe the second too.

Both compilers downloaded today :)

void UpdateParallax(unsigned char*tiles, unsigned char tilesbank, unsigned char base,unsigned char snap)
{
   if(stageframe2mod==0)
   {   
      changeBank(tilesbank);
      UNSAFE_SMS_VRAMmemcpy64((256+(base*4))*32,tiles+((stageframe%snap)<<7));
      UNSAFE_SMS_VRAMmemcpy64((256+2+(base*4))*32,tiles+64+((stageframe%snap)<<7));
      changeBank(FIXEDBANKSLOT);
   }
}

(stageframe is unsigned int)...

Z88DK:

_UpdateParallax:
   push   ix
   ld   ix,0
   add   ix,sp
   push   af
   ld   a,(_stageframe2mod)
   or   a, a
   jp   NZ, l_UpdateParallax_00103
   ld   a,(ix+6)
   push   af
   inc   sp
   call   _changeBank
   inc   sp
   ld   a,(ix+8)
   ld   (ix-2),a
   ld   (ix-1),0x00
   pop   hl
   push   hl
   push   hl
   ld   hl,_stageframe
   ld   c, (hl)
   inc   hl
   ld   b, (hl)
   push   bc
   call   __moduint_callee
   ld   c, l
   ld   b, h
   sla   c
   rl   b
   sla   c
   rl   b
   sla   c
   rl   b
   sla   c
   rl   b
   sla   c
   rl   b
   sla   c
   rl   b
   sla   c
   rl   b
   ld   a,(ix+4)
   add   a, c
   ld   c, a
   ld   a,(ix+5)
   adc   a, b
   ld   b, a
   ld   e,(ix+7)
   ld   d,0x00
   sla   e
   rl   d
   sla   e
   rl   d
   ld   hl,0x0100
   add   hl, de
   add   hl, hl
   add   hl, hl
   add   hl, hl
   add   hl, hl
   add   hl, hl
   push   de
   push   bc
   push   hl
   call   _UNSAFE_SMS_VRAMmemcpy64_callee
   pop   de
   ld   a,(ix+4)
   add   a,0x40
   ld   c, a
   ld   a,(ix+5)
   adc   a,0x00
   ld   b, a
   push   bc
   push   de
   ld   l,(ix-2)
   ld   h,(ix-1)
   push   hl
   ld   hl,_stageframe
   ld   c, (hl)
   inc   hl
   ld   b, (hl)
   push   bc
   call   __moduint_callee
   ld   (ix-1),h
   ld   (ix-2),l
   pop   de
   pop   bc
   pop   hl
   push   hl
   add   hl, hl
   add   hl, hl
   add   hl, hl
   add   hl, hl
   add   hl, hl
   add   hl, hl
   add   hl, hl
   add   hl,bc
   ld   c, l
   ld   b, h
   ld   hl,0x0102
   add   hl, de
   add   hl, hl
   add   hl, hl
   add   hl, hl
   add   hl, hl
   add   hl, hl
   push   bc
   push   hl
   call   _UNSAFE_SMS_VRAMmemcpy64_callee
   ld   a,0x09
   push   af
   inc   sp
   call   _changeBank
   inc   sp
l_UpdateParallax_00103:
   ld   sp, ix
   pop   ix
   ret
   SECTION code_compiler

SDCC:

;   ---------------------------------
; Function UpdateParallax
; ---------------------------------
_UpdateParallax::
   push   ix
   ld   ix,#0
   add   ix,sp
   push   af
;stagefunctions.h:3: if(stageframe2mod==0)
   ld   a,(#_stageframe2mod + 0)
   or   a, a
   jp   NZ, 00103$
;stagefunctions.h:5: changeBank(tilesbank);
   ld   a, 6 (ix)
   push   af
   inc   sp
   call   _changeBank
   inc   sp
;stagefunctions.h:6: UNSAFE_SMS_VRAMmemcpy64((256+(base*4))*32,tiles+((stageframe%snap)<<7));
   ld   a, 8 (ix)
   ld   -2 (ix), a
   ld   -1 (ix), #0x00
   pop   hl
   push   hl
   push   hl
   ld   hl, (_stageframe)
   push   hl
   call   __moduint
   pop   af
   pop   af
   add   hl, hl
   add   hl, hl
   add   hl, hl
   add   hl, hl
   add   hl, hl
   add   hl, hl
   add   hl, hl
   ld   c, l
   ld   b, h
   ld   l,4 (ix)
   ld   h,5 (ix)
   add   hl, bc
   ld   c, l
   ld   b, h
   ld   l, 7 (ix)
   ld   h, #0x00
   add   hl, hl
   add   hl, hl
   ex   de,hl
   ld   hl, #0x0100
   add   hl, de
   add   hl, hl
   add   hl, hl
   add   hl, hl
   add   hl, hl
   add   hl, hl
   push   de
   push   bc
   push   hl
   call   _UNSAFE_SMS_VRAMmemcpy64
   pop   af
   pop   af
   pop   de
;stagefunctions.h:7: UNSAFE_SMS_VRAMmemcpy64((256+2+(base*4))*32,tiles+64+((stageframe%snap)<<7));
   ld   a, 4 (ix)
   add   a, #0x40
   ld   c, a
   ld   a, 5 (ix)
   adc   a, #0x00
   ld   b, a
   push   bc
   push   de
   ld   l,-2 (ix)
   ld   h,-1 (ix)
   push   hl
   ld   hl, (_stageframe)
   push   hl
   call   __moduint
   pop   af
   pop   af
   pop   de
   pop   bc
   add   hl, hl
   add   hl, hl
   add   hl, hl
   add   hl, hl
   add   hl, hl
   add   hl, hl
   add   hl, hl
   add   hl,bc
   ld   c, l
   ld   b, h
   ld   hl, #0x0102
   add   hl, de
   add   hl, hl
   add   hl, hl
   add   hl, hl
   add   hl, hl
   add   hl, hl
   push   bc
   push   hl
   call   _UNSAFE_SMS_VRAMmemcpy64
   pop   af
;stagefunctions.h:8: changeBank(FIXEDBANKSLOT);
   ld   h,#0x09
   ex   (sp),hl
   inc   sp
   call   _changeBank
   inc   sp
00103$:
   ld   sp, ix
   pop   ix
   ret

Kind Regards
  View user's profile Send private message
  • Joined: 17 Nov 2015
  • Posts: 97
  • Location: Canada
Reply with quote
Post Posted: Sun Jun 11, 2017 12:52 am
The SO2 compile looks right to me. I'll have a look again to see if I can spot the problem. An error in SO2 is usually an issue from sdcc since there's a lot less post-processing.

The SO3 compile has a problem. Could you try an SO3 compile again after updating "sdcc_peeph.3" and "sdcc_peeph_cs.3" from z88dk/libsrc/_DEVELOPMENT ?

(For z88dk you can get inlined C as comments if you add "--c-code-in-asm" on the compile line)
  View user's profile Send private message Visit poster's website
  • Joined: 28 Jan 2017
  • Posts: 549
  • Location: Málaga, Spain
Reply with quote
Post Posted: Sun Jun 11, 2017 6:54 am
Maybe the error is elsewhere, or maybe the error were produced by an SO3 compilation (in such case, i am sorry. I have mixed two stages-two modes compiling scripts, to save time)

Have changed your files and now it compiles well both SO2 and SO3 modes. Yesterday were a hard work day and today's version is so different... maybe i did fix something without knowing or has been your fix. The point is i have 0x6xxx of code now and I dont see any bug, so well done! may be soon z88dk SO3 will be bugfree, if not now (so I only have to make a bugfree game of my own, hard work!)

Thanks as usual.

Kind regards.
  View user's profile Send private message
  • Joined: 28 Jan 2017
  • Posts: 549
  • Location: Málaga, Spain
Reply with quote
New fail on z88dk
Post Posted: Sat Jul 01, 2017 11:32 am
Ummm... if I compile with 12-6 version the game will run the same as sdcc.

Updated version for 26-6 one and the game does not run. Same with 28-6 release. Something must have been changed between both releases.

Kind regards
  View user's profile Send private message
  • Joined: 17 Nov 2015
  • Posts: 97
  • Location: Canada
Reply with quote
Post Posted: Sat Jul 01, 2017 3:53 pm
eruiz00 wrote
Ummm... if I compile with 12-6 version the game will run the same as sdcc.

Updated version for 26-6 one and the game does not run. Same with 28-6 release. Something must have been changed between both releases.


Thanks eruiz.. there's an outstanding bug here that will affect all rom model compiles.

I didn't have my thinking cap on when I wrote that as it didn't click that this is a critical bug for all rom compiles, not just specialized ones.

I'll ask Paulo to have a look sooner rather than later.
  View user's profile Send private message Visit poster's website
  • Joined: 28 Jan 2017
  • Posts: 549
  • Location: Málaga, Spain
Reply with quote
Post Posted: Sun Jul 02, 2017 1:10 pm
Waiting anxiously

Now the platformer is more optimized and it runs very well in both sdcc and z88dk, but i like trying on both compilers!
  View user's profile Send private message
  • Joined: 05 Sep 2013
  • Posts: 3769
  • Location: Stockholm, Sweden
Reply with quote
Post Posted: Sun Jul 02, 2017 1:26 pm
I wonder what's the cost of

call   __moduint

... did anyone try to profile that?
  View user's profile Send private message Visit poster's website
  • Joined: 17 Nov 2015
  • Posts: 97
  • Location: Canada
Reply with quote
Post Posted: Mon Jul 03, 2017 12:28 am
Last edited by Alcoholics Anonymous on Mon Jul 03, 2017 12:36 am; edited 1 time in total
eruiz00 wrote
Waiting anxiously

Now the platformer is more optimized and it runs very well in both sdcc and z88dk, but i like trying on both compilers!


It's been fixed now eruiz.

The sms examples successfully compile but I'm not convinced this caused the problem you saw. A black screen sounds like a vdp thing but let's see what happens.

Next build is in about two and half hours or if you're compiling yourself, git is always up to date.
  View user's profile Send private message Visit poster's website
  • Joined: 17 Nov 2015
  • Posts: 97
  • Location: Canada
Reply with quote
Post Posted: Mon Jul 03, 2017 12:33 am
sverx wrote
I wonder what's the cost of

call   __moduint

... did anyone try to profile that?


Up to 16-bits, sdcc's math is written in asm. After 16-bits, it's in C and is relatively slow.

Sdcc's implementation of moduint is the standard small-size one with a check for 8-bit parameter to make it faster in some 8-bit multiplication cases.

z88dk's libraries are all-asm so all math is in asm. The 16-bit moduint will be similar performance to sdcc's. However there is an option to build fast integer math that can make these routines much faster as they do leading zeroes elimination and loop unrolling.

If you write a program that covers the "average" cases you want to test, I can time it for you. There is a command line z80 emulator called ticks in the z88dk package that can measure execution time in cycles.
  View user's profile Send private message Visit poster's website
  • Joined: 05 Sep 2013
  • Posts: 3769
  • Location: Stockholm, Sweden
Reply with quote
Post Posted: Mon Jul 03, 2017 7:32 am
I simply meant "wouldn't it be damn slow?". BTW eruiz00 can time it himself, if he uses Emulicious. He can set the profiler 'on' that call and let it run for a while, the profiler will return min/avg/max time.

The point anyway is: if the 'snap' variable isn't in fact changing, there may be some options to save that call, and even if it changes there could be some other 'tricks', I am simply suggesting that it might be worth exploring them.

Also those '<<7' and '*4*32' can be probably turned into faster operations like '((<<8)>>1)' or such. Just my two cents on how to get the most out of his C code.
  View user's profile Send private message Visit poster's website
  • Joined: 28 Jan 2017
  • Posts: 549
  • Location: Málaga, Spain
Reply with quote
Post Posted: Mon Jul 03, 2017 12:26 pm
Those things are important (i count on those compilers to make better optimization than i would) but i found more interesting the custom tricky optimizations and engine design for an specific game and 8 bit small power. Those forgotten today, even in web world, with the appearance of webgl.
  View user's profile Send private message
  • Joined: 28 Jan 2017
  • Posts: 549
  • Location: Málaga, Spain
Reply with quote
Post Posted: Mon Jul 03, 2017 12:41 pm
Btw i see sdcc .map file has changed. Now it not has lcode label so i search the last code label to see how many space for code left.

I see in z88dk (in cmd finishing compilation) more info:

7kb ram free (excepting stack)

19kb free of 32kb.

So what is the real free space for code? I suppose is 7kb but not sure.

Regards
  View user's profile Send private message
  • Joined: 05 Sep 2013
  • Posts: 3769
  • Location: Stockholm, Sweden
Reply with quote
Post Posted: Mon Jul 03, 2017 12:55 pm
with devkitSMS on SDCC your code is (usually) in ROM first 32 KB. Don't know if with z88dk this is different, though.
  View user's profile Send private message Visit poster's website
  • Joined: 17 Nov 2015
  • Posts: 97
  • Location: Canada
Reply with quote
Post Posted: Mon Jul 03, 2017 5:16 pm
Quote

The point anyway is: if the 'snap' variable isn't in fact changing, there may be some options to save that call, and even if it changes there could be some other 'tricks', I am simply suggesting that it might be worth exploring them.

Also those '<<7' and '*4*32' can be probably turned into faster operations like '((<<8)>>1)' or such. Just my two cents on how to get the most out of his C code.


Yes if you can avoid multiplication and division, that would be best.

The -SO3 compile in z88dk will try to turn those "<<7", etc into right shifts on its own and ought to do better than "(<<8)>>1". However this would probably lead to better code on sdcc. As always, if you're in the speed critical path, using the generated asm as guide for optimizing the C is best. sdcc outputs translated asm by default; with z88dk add "-m --list --c-code-in-asm" to get the map file and asm listings. With both compilers you can also translate individual files to asm which can be a lot faster than compiling an entire project to get the list files.

Quote

with devkitSMS on SDCC your code is (usually) in ROM first 32 KB. Don't know if with z88dk this is different, though.


It's the same - the main bank is usually 0-32k, ram at 0xc000 and bankswitching in the 16k bank at 0x8000.

A typical compile (using AstroForce as example) will list some information when the .sms file is made:


Notice: Available RAM space is 6973 bytes ignoring the stack
Adding main banks 0x00,0x01 (1940 bytes free)
Adding bank 0x02 (75 bytes free)
Adding bank 0x03 (19 bytes free)
Adding bank 0x04 (67 bytes free)
Adding bank 0x05 (37 bytes free)
Adding bank 0x06 (79 bytes free)
Adding bank 0x07 (48 bytes free)
Adding bank 0x08 (87 bytes free)
Adding bank 0x09 (130 bytes free)
Adding bank 0x0A (30 bytes free)
Adding bank 0x0B (981 bytes free)
Adding bank 0x0C (175 bytes free)
Adding bank 0x0D (22 bytes free)
Adding bank 0x0E (77 bytes free)
Adding bank 0x0F (1685 bytes free)


The available RAM space is the slack in RAM at 0xc000. So this figure tells you how much space is available for the stack (256 bytes is normally enough) and more variables.

The main banks 0x00 and 0x01 is your bottom 0-32k where normal code is placed. 1940 bytes means there is 1940 bytes for more code.

The rest tells you available space in each 16k bank.

After a compile, z88dk will leave the generated binaries in the build directory:


 16,309 astroforce_BANK_02.bin
 16,365 astroforce_BANK_03.bin
 16,317 astroforce_BANK_04.bin
 16,347 astroforce_BANK_05.bin
 16,305 astroforce_BANK_06.bin
 16,336 astroforce_BANK_07.bin
 16,297 astroforce_BANK_08.bin
 16,254 astroforce_BANK_09.bin
 16,354 astroforce_BANK_0A.bin
 15,403 astroforce_BANK_0B.bin
 16,209 astroforce_BANK_0C.bin
 16,362 astroforce_BANK_0D.bin
 16,307 astroforce_BANK_0E.bin
 14,699 astroforce_BANK_0F.bin
    588 astroforce_BSS.bin
 30,189 astroforce_CODE.bin
    623 astroforce_DATA.bin


*_CODE is your program code destined for the 0-32k area. This plus a stored copy of the DATA section plus any headers is what is placed at address 0. The program can build by storing a straight copy of DATA in rom or it can store a compressed copy of DATA in rom. I tried compression with "zx7 astroforce_DATA.bin" which shrank the file from 623 to 578 bytes but that is only a savings of 45 bytes. You have to save at least ~70-80 bytes to make compression worthwhile because the decompressor itself is about that big. The small compression here is unusual so I suspect that data has already been compressed once. The default build stores a copy of the DATA section but if you wanted to use a compressed DATA section you would add "-pragma-output:CRT_MODEL=2" to the compile line.

*_DATA is your initialized variables and self-modifying code (stuff that is in ram but must be initialized to non-zero values) and *_BSS is the rest of your variables that will be zeroed at startup. DATA+BSS goes into ram (here 623+588 = 1211 bytes leaving 8192-1211 = 6981 bytes free there; if you subtract the sms's memory mapped registers up there you will get the 6973 byte figure already reported).

Quote

Btw i see sdcc .map file has changed. Now it not has lcode label so i search the last code label to see how many space for code left.


You can also see this by looking at the .sms file in a hex viewer or similar. An .sms is just a raw binary with the banks in sequential order. So the first 32k of that file is going to be your main code in 0-32k.

With the output astroforce.sms file, I changed the file extension to astroforce.sms.bin and dragged it onto vs2015 to have a look at it a hexdump. At address 0x785c there was a gap of FFs and then the sms header appeared at 0x7ff0. So that would indicate a space of 0x7ff0 - 0x785c = 1940 bytes available which agrees exactly with the message generated above.
  View user's profile Send private message Visit poster's website
  • Joined: 05 Sep 2013
  • Posts: 3769
  • Location: Stockholm, Sweden
Reply with quote
Post Posted: Mon Jul 03, 2017 9:30 pm
Alcoholics Anonymous wrote
The -SO3 compile in z88dk will try to turn those "<<7", etc into right shifts on its own and ought to do better than "(<<8)>>1". However this would probably lead to better code on sdcc.


I somehow expected SDCC to handle those "<<7" better, at least when applied to a unsigned char casted to unsigned int. It's really all about loading 0x00 to a register and shift right one bit from the other into the msb in there... is z88dk doing something different here or is it applying peep-hole optimizer rules on the generated asm?
  View user's profile Send private message Visit poster's website
  • Joined: 05 Sep 2013
  • Posts: 3769
  • Location: Stockholm, Sweden
Reply with quote
Post Posted: Mon Jul 03, 2017 9:36 pm
eruiz00 wrote
Those things are important (i count on those compilers to make better optimization than i would) but i found more interesting the custom tricky optimizations and engine design for an specific game and 8 bit small power. Those forgotten today, even in web world, with the appearance of webgl.


I understand your point but when you don't get the expected performance from your engine you should probably profile your functions to find what's eating up your CPU horsepower. Divisions will eat a lot, as the Z80 has no hardware divide opcode, and it neither have hardware multiplications... actually I've experienced once that a single divide or a modulus can eat up to 2% of your frame time.
  View user's profile Send private message Visit poster's website
  • Joined: 28 Jan 2017
  • Posts: 549
  • Location: Málaga, Spain
Reply with quote
Post Posted: Wed Jul 05, 2017 7:57 pm
Ummmmm.... this evening i had time to download z88dk latest and test a build and it does not work with so3 neither so2 options. Result is the same, black screen.

It compile anf run well with sdcc (all versions) and z88dk 12-6 version, but does not with 26-6, 28-6 or 5-7 version.

Tell me if you need something (code, sms roms) or i can help some way.

Regards.
  View user's profile Send private message
  • Joined: 28 Jan 2017
  • Posts: 549
  • Location: Málaga, Spain
Reply with quote
Post Posted: Mon Jul 10, 2017 7:27 pm
Hey AA

If this can be of help, downloaded today version and things now funcs again.

I dont know if something has changed. In my computer the only change is i uninstalled python malwarebytes ninja ide visual studio 2015 ce jre jdk netbeans glassfish codeblocks lazarus and PxxxShxP.

Also my computer seems faster and better now. Maybe uninstalling office and even better.

Thanks! (LOL)
  View user's profile Send private message
  • Joined: 17 Nov 2015
  • Posts: 97
  • Location: Canada
Reply with quote
Post Posted: Tue Jul 11, 2017 4:56 am
eruiz00 wrote

If this can be of help, downloaded today version and things now funcs again.


Good to hear. I wasn't quite sure what was going on as everything was working for me on windows. There has been an update in the compiler used to cross-compile the windows binaries so maybe that cleared up something funky.

Quote

I somehow expected SDCC to handle those "<<7" better, at least when applied to a unsigned char casted to unsigned int. It's really all about loading 0x00 to a register and shift right one bit from the other into the msb in there...


Well we don't have perfection in z80 code generation but sdcc is still a capable tool that can get some good results. When we combine sdcc with our rules, for the most part it's beating all the other compilers I've been testing it against, including commercial ones.

Quote

is z88dk doing something different here or is it applying peep-hole optimizer rules on the generated asm?


We haven't touched sdcc's code generation yet. Doing that is not easy because you have to understand the compiler first and that requires some time investment. Anyway, Philip has been responsive in fixing the worst problems despite being busy.

So we're only doing things to improve the code in post-processing (mainly the peepholer) and communicating some information about special library functions to the compiler. The right shift thing is being done in the peephole rules.

sdcc does have bugs in its peephole code that doesn't allow peephole rules to be applied in all situations where they can be so currently you can't just pick up z88dk's set and use them with sdcc. We do plan to feed our changes back to sdcc with patch proposals but what we have now is mainly hacks because the changes were made when we were experimenting with sdcc and didn't know the big picture. The idea is to cleanly implement the changes (and maybe improve them) before feeding back to sdcc.
  View user's profile Send private message Visit poster's website
  • Joined: 07 Aug 2007
  • Posts: 220
  • Location: Yach, Germany
Reply with quote
Post Posted: Sun Jul 30, 2017 1:11 pm
sverx wrote
Alcoholics Anonymous wrote
The -SO3 compile in z88dk will try to turn those "<<7", etc into right shifts on its own and ought to do better than "(<<8)>>1". However this would probably lead to better code on sdcc.


I somehow expected SDCC to handle those "<<7" better, at least when applied to a unsigned char casted to unsigned int. It's really all about loading 0x00 to a register and shift right one bit from the other into the msb in there... is z88dk doing something different here or is it applying peep-hole optimizer rules on the generated asm?


Well, as far as I see, in the code example posted, the <<7 is applied to an unsigned int. And add hl, hl is a really efficient instruction. The 7 add hl, hl generated take 7 bytes and 77 cycles. An alternative would be:


rr h
ld h, l
rr h
ld l, #0
rr l


The code would be substantially faster at 35 cycles, but 2 bytes longer. Even if this gets implemented in SDCC code generation, it would be used for --opt-code-speed only.

Philipp
  View user's profile Send private message Visit poster's website
  • Joined: 07 Aug 2007
  • Posts: 220
  • Location: Yach, Germany
Reply with quote
Post Posted: Sun Jul 30, 2017 1:46 pm
PkK wrote

The code would be substantially faster at 35 cycles, but 2 bytes longer. Even if this gets implemented in SDCC code generation, it would be used for --opt-code-speed only.

Philipp


Well, there would also be an advantage when the operand / result is not in hl. I've implemented the optimization (revision #9972), and in the above code example, without --opt-code-speed, the first sequence of 7 add hl, hl would be replaced by the new code (since the reusult is not in hl), but the second one stays (the operand is in hl, and having the result in hl too provides an advantage for the following addition). With --opt-code-speed both use the new code.

Philipp

P.S.: New code at first <<7:


   call   __moduint
   pop   af
   pop   af
   rr   h
   ld   b, l
   rr   b
   ld   c, #0x00
   rr   c
  View user's profile Send private message Visit poster's website
  • Joined: 05 Sep 2013
  • Posts: 3769
  • Location: Stockholm, Sweden
Reply with quote
Post Posted: Tue Aug 01, 2017 9:41 am
PkK wrote
Well, as far as I see, in the code example posted, the <<7 is applied to an unsigned int.


Yes, I was suggesting to switch to an unsigned char and explicitly do <<8 to cast to an int and >>1 after that.
  View user's profile Send private message Visit poster's website
  • Joined: 17 Nov 2015
  • Posts: 97
  • Location: Canada
Reply with quote
Post Posted: Fri Aug 04, 2017 2:50 pm
PkK wrote

P.S.: New code at first <<7:


   call   __moduint
   pop   af
   pop   af
   rr   h
   ld   b, l
   rr   b
   ld   c, #0x00
   rr   c


FWIW, z88dk generates this at -SO3:


   call   __moduint_callee
   ld   e,l  ;;*
   ld   d,h  ;;*
   xor   a,a
   srl   h
   rr   l
   rra
   ld   h,l
   ld   l,a   ;;*

;; stop here

   ld   c,a
   ex   de,hl
   ld   b,d
   ld   a,(ix+4)
   add   a, c
   ld   c, a
   ld   a,(ix+5)
   adc   a, b
   ld   b, a
   ld   e,(ix+7)
   ld   d,0x00
   ex   de,hl
   add   hl,hl
   add   hl,hl
   ex   de,hl
   ld   hl,0x0100
   add   hl, de
   add   hl, hl
   add   hl, hl
   add   hl, hl
   add   hl, hl
   add   hl, hl
   push   de
   push   bc
   push   hl
   call   _UNSAFE_SMS_VRAMmemcpy64_callee
   pop   de


It does these substitutions with a combination of special cases (8 'add hl,hl' in a row gets replaced by something, 7 by something else) and by rules that act like induction where you have a base case and then a rule that can expand on the base case when more shifts by one bit are present. Five "add hl,hl" in a row can't be improved - that's why it's still there.

I listed more of the asm listing above because the peepholer should be able to do more for the code above. It should be able to eliminate the instructions marked with "*" above but it is being stopped by incorrect handling of the "ex de,hl" instruction. The peepholer currently treats that as both de and hl being read but in fact, it should simply swap the de and hl registers if it is looking for those and continue searching for a read. I see these extra loads involving de&hl fairly often especially after other rules are employed. I want to look at fixing that up but haven't gotten around to it yet.
  View user's profile Send private message Visit poster's website
  • Joined: 07 Aug 2007
  • Posts: 220
  • Location: Yach, Germany
Reply with quote
Post Posted: Mon Aug 07, 2017 9:01 am
Alcoholics Anonymous wrote


FWIW, z88dk generates this at -SO3:


   call   __moduint_callee
   ld   e,l  ;;*
   ld   d,h  ;;*
   xor   a,a
   srl   h
   rr   l
   rra
   ld   h,l
   ld   l,a   ;;*

;; stop here




Yes. Using a where available can improve the code a bit further. Implemented in SDCC revision #9984.

Philipp
  View user's profile Send private message Visit poster's website
Reply to topic



Back to the top of this page

Back to SMS Power!