Release 崩潰 Debug 不崩?
在夸克2.0版本,出現一個在release 下必崩的問題。經過一輪排查,終于發現問題的根源所在。排查過程相當耗時,故記錄下來,方便以后學習。
排查過程比較冗長。看不下的,可以直接查看結論。
0x1 崩潰日志
Exception Type:  SIGSEGV
Exception Codes: SEGV_ACCERR at 0x0
Crashed Thread:  1

Thread 1 Crashed:
0   libobjc.A.dylib                     0x0000000189f35e34 (anonymous namespace)::AutoreleasePoolPage::pop(void*) + 68
1   Quark                               0x0000000100590494 __65-[CMSDataModel saveSubRes:resCode:subResUrl:decompress:callback:]_block_invoke (CMSDataModel.mm:258)
2   libdispatch.dylib                   0x000000018a36e9e0 _dispatch_call_block_and_release + 24
3   libdispatch.dylib                   0x000000018a36e9a0 _dispatch_client_callout + 16
4   libdispatch.dylib                   0x000000018a37d0d4 _dispatch_queue_override_invoke + 644
5   libdispatch.dylib                   0x000000018a37ea50 _dispatch_root_queue_drain + 540
6   libdispatch.dylib                   0x000000018a37e7d0 _dispatch_worker_thread3 + 124
7   libsystem_pthread.dylib             0x000000018a577100 _pthread_wqthread + 1096
8   libsystem_pthread.dylib             0x000000018a576cac start_wqthread + 4
從崩潰日志知道,崩潰發生的原因是訪問了無效的地址0x0(SIGSEGV),查看崩潰的調用現場,很直接,崩潰發生的地方在:
[CMSDataModel saveSubRes:resCode:subResUrl:decompress:callback:]
下面貼出關鍵段的代碼:
?dispatch_async(self.ioQueue,?^{ ????????????BOOL?ret?=?NO; ????????????NSFileManager?*fileManager?=?[[NSFileManager?alloc]?init]; ????????????do ????????????{ ????????????????NSString?*path?=?[self?subResFilePathOfOnline:resCode?subResUrl:subResUrl?filePath:nil?createDir:YES]; ????????????????if?(![subResData?writeToFile:path?atomically:YES]) ????????????????{ ????????????????????[fileManager?removeItemAtPath:path?error:nil]; ????????????????????break; ????????????????} ????????????????if?(bDecompress) ????????????????{ ????????????????????ZipArchive?*archive?=?[[ZipArchive?alloc]?init]; ????????????????????if?(![archive?UnzipOpenFile:path]) ????????????????????{ ????????????????????????[fileManager?removeItemAtPath:path?error:nil]; ????????????????????????[archive?UnzipCloseFile]; ????????????????????????break; ????????????????????} ???? ????????????????????//再次訪問Self的時候發(fā)生崩潰 ????????????????????NSString*?tempFolder?=?[self?subResFolderOfOnline:resCode?subResUrl:subResUrl?isTemp:YES]; ????????????????????if?(![archive?UnzipFileTo:tempFolder?overWrite:YES]) ????????????????????{ ????????????????????????[fileManager?removeItemAtPath:path?error:nil]; ????????????????????????[fileManager?removeItemAtPath:tempFolder?error:nil]; ????????????????????????[archive?UnzipCloseFile]; ????????????????????????break; ????????????????????} ????????????????????[fileManager?removeItemAtPath:path?error:nil]; ????????????????????[archive?UnzipCloseFile]; ????????????????????NSString?*resFolder?=?[self?subResFolderOfOnline:resCode?subResUrl:subResUrl?isTemp:NO]; ????????????????????if?([fileManager?fileExistsAtPath:resFolder]) ????????????????????{ ????????????????????????[fileManager?removeItemAtPath:resFolder?error:nil]; ????????????????????} ????????????????????if?(![fileManager?moveItemAtPath:tempFolder?toPath:resFolder?error:nil]) ????????????????????{ ????????????????????????[fileManager?removeItemAtPath:tempFolder?error:nil]; ????????????????????} ????????????????} 
????????????????ret?=?YES; ????????????}?while?(NO); ????????????if?(callback) ????????????{ ????????????????dispatch_async(dispatch_get_main_queue(),?^{ ????????????????????callback(ret,?resCode,?subResUrl); ????????????????}); ????????????} ????????});
因為崩潰只發生在release包,所以把scheme改成release,模擬案發場景。果然程序很快崩潰在:
NSString* tempFolder = [self subResFolderOfOnline:resCode subResUrl:subResUrl isTemp:YES];
訪問self 的時候崩潰了!這看起來不可能呀!
確定的是self 是肯定不會被釋放的,而且也不存在多線程讀寫的問題。那么訪問self怎么就會崩潰了呢!這不科學呀!
當時因為趕著發包,需要馬上解決這個問題。經過幾輪修改,最后的解決方案是,把do while 語句去掉(我也不知道為什么想到這么改)。發現居然解決了崩潰問題,那時候懷疑的是編譯選項的問題。但是好像也說不過去呀,關do while毛線事呀。
勁總問起,因為還沒清楚原因,只是說把do while 改了就修復崩潰了,懷疑與編譯選項優化有關。
嗯嗯,問題解決了,先發包吧!
0x2 你良心不會痛嗎?
正式包上線后,問題被修復了。但是這么嚴重怪異的崩潰問題,就這么敷衍的修復了?會不會掩蓋了更嚴重的問題呀?
于是決定研究一下,防止以后再遇到這種問題而無從入手。既然可能是因為編譯選項的問題,那么先從匯編上分析一下吧。以下是用hopper查看該函數的匯編代碼(部分):
//?0x100d81000?+?0x8d0?定位到?NSFileManager 0000000100551e28?????????adrp???????x8,?#0x100d81000 0000000100551e2c?????????ldr????????x0,?[x8,?#0x8d0] //?0x100d5c000?+?0xe00?定位到?alloc 0000000100551e30?????????adrp???????x8,?#0x100d5c000 0000000100551e34?????????ldr????????x23,?[x8,?#0xe00] 0000000100551e38?????????mov????????x1,?x23 0000000100551e3c?????????bl?????????imp___stubs__objc_msgSend //?0x100d5c000?+?0xcd8?定位到?init 0000000100551e40?????????adrp???????x8,?#0x100d5c000 0000000100551e44?????????ldr????????x22,?[x8,?#0xcd8] 0000000100551e48?????????mov????????x1,?x22 0000000100551e4c?????????bl?????????imp___stubs__objc_msgSend 0000000100551e50?????????mov????????x19,?x0 0000000100551e54?????????ldp????????x0,?x2,?[x20,?#0x20] 0000000100551e58?????????ldr????????x3,?[x20,?#0x30] //?0x100d72000?+?0x250?定位到?subResFilePathOfOnline:subResUrl:filePath:createDir 0000000100551e5c?????????adrp???????x8,?#0x100d72000 0000000100551e60?????????ldr????????x1,?[x8,?#0x250] 0000000100551e64?????????orr????????w5,?wzr,?#0x1 0000000100551e68?????????movz???????x4,?#0x0 0000000100551e6c?????????bl?????????imp___stubs__objc_msgSend 0000000100551e70?????????mov????????x29,?x29 0000000100551e74?????????bl?????????imp___stubs__objc_retainAutoreleasedReturnValue 0000000100551e78?????????mov????????x21,?x0 0000000100551e7c?????????ldr????????x0,?[x20,?#0x38] //?0x100d5d000?+?0x120?定位到?writeToFile:atomically: 0000000100551e80?????????adrp???????x8,?#0x100d5d000 0000000100551e84?????????ldr????????x1,?[x8,?#0x120] 0000000100551e88?????????orr????????w3,?wzr,?#0x1 0000000100551e8c?????????mov????????x2,?x21 0000000100551e90?????????bl?????????imp___stubs__objc_msgSend 0000000100551e94?????????tbz????????w0,?0x0,?-[CMSDataModel?saveSubRes:resCode:subResUrl:decompress:callback:]+972 0000000100551e98?????????ldrb???????w8,?[x20,?#0x48] 0000000100551e9c?????????cbz????????w8,?-[CMSDataModel?saveSubRes:resCode:subResUrl:decompress:callback:]+1000 //?0x100d84000?+?0x110?定位到?ZipArchive 
0000000100551ea0?????????adrp???????x8,?#0x100d84000 0000000100551ea4?????????ldr????????x0,?[x8,?#0x110] 0000000100551ea8?????????mov????????x1,?x23 0000000100551eac?????????bl?????????imp___stubs__objc_msgSend 0000000100551eb0?????????mov????????x1,?x22 0000000100551eb4?????????bl?????????imp___stubs__objc_msgSend 0000000100551eb8?????????mov????????x22,?x0 //?0x100d72000?+?0x258?定位到?UnzipOpenFile 0000000100551ebc?????????adrp???????x8,?#0x100d72000 0000000100551ec0?????????ldr????????x1,?[x8,?#0x258] 0000000100551ec4?????????mov????????x0,?x22 0000000100551ec8?????????mov????????x2,?x21 0000000100551ecc?????????bl?????????imp___stubs__objc_msgSend 0000000100551ed0?????????tbz????????w0,?0x0,?-[CMSDataModel?saveSubRes:resCode:subResUrl:decompress:callback:]+1008 //按照調(diào)用約定,,x0?寄存器保存self,x0?=?x20+0x20?,?x0作為函數(shù)調(diào)用的第一個參數(shù)傳入。但是發(fā)現(xiàn)x0?寄存器的值是0x20,導(dǎo)致訪問self?的時候發(fā)生崩潰。這是典型的空指針問題呀~~~ 0000000100551ed4?????????ldr????????x0,?[x20,?#0x20] 0000000100551ed8?????????adrp???????x8,?#0x100d72000
那么關鍵是要看x20寄存器的值,究竟是誰動了x20寄存器!
0x3 誰動了x20寄存器
發生崩潰的原因很明顯,就是x20的值被修改了,只要查出是誰動了x20寄存器,就知道這是誰的pot了。
1.png
在執行 bl 0x1009aaac4 前,通過lldb查看$x20+0x20地址上的值是多少。
bl 0x1009aaac4 執行后,x20 就被修改了。那么關鍵就要看:bl 0x1009aaac4 做了什么。下面是調用0x1009aaac4 后的匯編代碼:
2.png
x20 的值保存在sp+0x10上,待函數執行完后,x20重新恢復原來的值,即 x20 = [sp+0x10](從該地址讀回)。那么關鍵點就落在觀察sp+0x10是否被修改。觀察一個地址是否被修改,可以通過設置watchpoint來觀察。
(lldb) watchpoint set expression $sp+0x10
接著繼續執行,斷點果然被命中了!
3.png
sp 存入x1 后,緊接著就去調用unzGetGlobalInfo 。
4.png
x1+0x10 其實就是sp+0x10 ,x1+0x10 處的值被修改成0 。接著函數返回后,
5.png
0x1008607dc: ldp x20, x19, [sp, #0x10]    x20 = [sp + 0x10] = 0。 x20+0x20 = 0x20  這就是訪問的非法地址!
最后問題的關鍵在于unzGetGlobalInfo 函數,為什么它會修改 x1+0x10 的值?后面會解析,現在先看看為什么在debug下沒有問題。
0x4 為什么開發的時候沒問題呀? --- Debug with Debug scheme
同樣的,通過打斷點,查看unzGetGlobalInfo 函數的匯編代碼。
6.png
這匯編代碼和release下看起來不像是同一份代碼呀!難道有兩個zip庫!
接下來就尷尬了,項目里面真的有兩個zip庫,unzGetGlobalInfo 的實現是不一樣的。
A 版本的:unzGetGlobalInfo
typedef?struct?unz_global_info_s { ????uLong?number_entry;?????????/*?total?number?of?entries?in ???????????????????????the?central?dir?on?this?disk?*/ ????uLong?size_comment;?????????/*?size?of?the?global?comment?of?the?zipfile?*/ }?unz_global_info; extern?int?ZEXPORT?unzGetGlobalInfo?(file,pglobal_info) ????unzFile?file; ????unz_global_info?*pglobal_info; { ????unz_s*?s; ????if?(file==NULL) ????????return?UNZ_PARAMERROR; ????s=(unz_s*)file; ????*pglobal_info=s->gi; ????return?UNZ_OK; }
B 版本的:unzGetGlobalInfo
typedef?struct?unz_global_info64 { ????unsigned?long?long?number_entry;?????????/*?total?number?of?entries?in ?????????????????????????????????the?central?dir?on?this?disk?*/ ???? ?????unsigned?long?number_disk_with_CD;??/*?number?the?the?disk?with?central?dir,?used?for?spanning?ZIP*/ ???? ????unsigned?long?size_comment;?????????/*?size?of?the?global?comment?of?the?zipfile?*/ }?unz_global_info64; extern?int?ZEXPORT?unzGetGlobalInfo(unzFile?file,?unz_global_info?*pglobal_info32) { ????unz64_s?*s; ????if?(file?==?NULL) ????????return?UNZ_PARAMERROR; ????s?=?(unz64_s?*)file; ????/*?to?do?:?check?if?number_entry?is?not?truncated?*/ ????pglobal_info32->number_entry?=?(uLong)s->gi.number_entry; ????pglobal_info32->size_comment?=?s->gi.size_comment; ????pglobal_info32->number_disk_with_CD?=?s->gi.number_disk_with_CD; ????return?UNZ_OK; } extern?int?ZEXPORT?unzGetGlobalInfo(unzFile?file,?unz_global_info?*pglobal_info32) { ????unz64_s?*s; ????if?(file?==?NULL) ????????return?UNZ_PARAMERROR; ????s?=?(unz64_s?*)file; ????/*?to?do?:?check?if?number_entry?is?not?truncated?*/ ????pglobal_info32->number_entry?=?(uLong)s->gi.number_entry; ????pglobal_info32->size_comment?=?s->gi.size_comment; ????pglobal_info32->number_disk_with_CD?=?s->gi.number_disk_with_CD; ????return?UNZ_OK; }
s->gi 在A版本和B版本的結構體類型是不一樣的,從上面的匯編代碼知道,*pglobal_info=s->gi; 是關鍵的步驟,會不會由于A版本的unzGetGlobalInfo 函數用了B版本的s->gi 結構體類型呢?導致賦值的時候發生錯誤。(一種可能的解釋,有待驗證:若調用方按A版本只含兩個字段的unz_global_info在棧上分配16字節,而實際執行的實現按B版本的三個字段寫入,在arm64上共24字節,第三個字段恰好落在偏移0x10處,這與前面觀察到的sp+0x10被改寫成0相吻合。)
0x5 刪除多余的zip 庫
對于上面的假設,可以刪除其中一個zip 庫來驗證。實驗證明,刪除多余的zip 庫后,在release scheme 下,不會崩潰了!
結論
結論很簡單,就是因為用了兩個zip 庫,導致編譯器編譯的時候,發生一些奇異的問題(本人對編譯器不是很熟悉,后面有時間再研究學習一下)。大家遇到在release下崩潰,debug不崩的情況,不妨考慮一下是否有兩份一樣的代碼。
遺留的問題
兩份zip 代碼,為什么沒有報duplicate symbols。編譯器是如何選擇哪一份代碼編譯的。
遺留的問題在這里可以查看!