6d7e14d2a4 
					 
					
						
						
							
							Corrects compiler warning: C++11 requires a space between string literal and macro  
						
						 
						
						
						
						
					 
					
						2021-06-19 17:17:06 +00:00  
					
					
						 
						
						
							
							
							 
							
							
							
							
							 
						
					 
				 
			
				
					
						
					 
					
						
						
							
						
						b1ace63607 
					 
					
						
						
							
							codespell: spelling corrections  
						
						 
						
						
						
						
					 
					
						2021-05-06 01:52:55 +10:00  
					
					
						 
						
						
							
							
							 
							
							
							
							
							 
						
					 
				 
			
				
					
						
					 
					
						
						
							
						
						3c140b87aa 
					 
					
						
						
							
							Merge branch 'testing' of  https://github.com/gigablast/open-source-search-engine  into testing  
						
						 
						
						
						
						
					 
					
						2016-03-29 12:42:05 -06:00  
					
					
						 
						
						
							
							
							 
							
							
							
							
							 
						
					 
				 
			
				
					
						
					 
					
						
						
							
						
						cf7ec13de6 
					 
					
						
						
							
							Fix international domain printing bug.  
						
						 
						
						
						
						
					 
					
						2016-03-29 12:41:34 -06:00  
					
					
						 
						
						
							
							
							 
							
							
							
							
							 
						
					 
				 
			
				
					
						
					 
					
						
						
							
						
						136d23816c 
					 
					
						
						
							
							fix hashbang properly  
						
						 
						
						
						
						
					 
					
						2016-03-21 09:29:55 -07:00  
					
					
						 
						
						
							
							
							 
							
							
							
							
							 
						
					 
				 
			
				
					
						
					 
					
						
						
							
						
						61ef806dea 
					 
					
						
						
							
							hash bang fix.  
						
						 
						
						... 
						
						
						
						detect more corruption.
don't dump titledb and spiderdb at same time,
seems to reduce corruption in rdbmem. 
						
						
					 
					
						2016-03-20 12:50:43 -07:00  
					
					
						 
						
						
							
							
							 
							
							
							
							
							 
						
					 
				 
			
				
					
						
					 
					
						
						
							
						
						34b33f478a 
					 
					
						
						
							
							added gb rwtest and exposed seektest and thrutest in gb -h.  
						
						 
						
						... 
						
						
						
						use -o sync when mounting ssds to avoid really slow and spiky
linux file/page cache. allow launching of more than 1 non-disk
thread again. should help with unlinking, intersects, etc. 
						
						
					 
					
						2015-11-30 21:29:17 -07:00  
					
					
						 
						
						
							
							
							 
							
							
							
							
							 
						
					 
				 
			
				
					
						
					 
					
						
						
							
						
						5f1695fab8 
					 
					
						
						
							
							fix url.cpp  
						
						 
						
						
						
						
					 
					
						2015-11-10 00:29:42 -07:00  
					
					
						 
						
						
							
							
							 
							
							
							
							
							 
						
					 
				 
			
				
					
						
					 
					
						
						
							
						
						5061e5d7b5 
					 
					
						
						
							
							normalize utf8 url paths into url encoded sequences.  
						
						 
						
						
						
						
					 
					
						2015-11-09 13:54:32 -07:00  
					
					
						 
						
						
							
							
							 
							
							
							
							
							 
						
					 
				 
			
				
					
						
					 
					
						
						
							
						
						16b6e44bd1 
					 
					
						
						
							
							Show utf8 url in page results.  
						
						 
						
						
						
						
					 
					
						2015-09-21 16:44:40 -06:00  
					
					
						 
						
						
							
							
							 
							
							
							
							
							 
						
					 
				 
			
				
					
						
					 
					
						
						
							
						
						83190e3bbc 
					 
					
						
						
							
							Make punycoded urls printable.  
						
						 
						
						
						
						
					 
					
						2015-09-21 09:17:40 -06:00  
					
					
						 
						
						
							
							
							 
							
							
							
							
							 
						
					 
				 
			
				
					
						
					 
					
						
						
							
						
						5caa219c71 
					 
					
						
						
							
							Reduce false positives by not counting \0 as a non-ascii char in the url.  
						
						 
						
						
						
						
					 
					
						2015-09-14 12:24:50 -06:00  
					
					
						 
						
						
							
							
							 
							
							
							
							
							 
						
					 
				 
			
				
					
						
					 
					
						
						
							
						
						5d724cdcc3 
					 
					
						
						
							
							Check for spaces before non-ascii chars to reduce false positives.  
						
						 
						
						... 
						
						
						
						Also print the position of non-ascii char to aid debugging.
We still need to handle utf8 chars in path. 
						
						
					 
					
						2015-09-14 11:11:56 -06:00  
					
					
						 
						
						
							
							
							 
							
							
							
							
							 
						
					 
				 
			
				
					
						
					 
					
						
						
							
						
						519b2c4f42 
					 
					
						
						
							
							Fix repeating xn--xn-- when there are spaces in the domain.  
						
						 
						
						... 
						
						
						
						Make gb unittest take a name of the unit test to run. 
						
						
					 
					
						2015-09-14 10:24:22 -06:00  
					
					
						 
						
						
							
							
							 
							
							
							
							
							 
						
					 
				 
			
				
					
						
					 
					
						
						
							
						
						519017828c 
					 
					
						
						
							
							Enable punycode domains for testing.  
						
						 
						
						... 
						
						
						
						We still need to display them as utf8 on the front end. 
						
						
					 
					
						2015-09-14 09:32:25 -06:00  
					
					
						 
						
						
							
							
							 
							
							
							
							
							 
						
					 
				 
			
				
					
						
					 
					
						
						
							
						
						5622ca47ee 
					 
					
						
						
							
							Work on non-ascii domain names.  It works on correct inputs, but  
						
						 
						
						... 
						
						
						
						will crash on some incorrect inputs, so it is forced to be disabled. 
						
						
					 
					
						2015-09-14 00:34:44 -06:00  
					
					
						 
						
						
							
							
							 
							
							
							
							
							 
						
					 
				 
			
				
					
						
					 
					
						
						
							
						
						cb6ca24c26 
					 
					
						
						
							
							Allow nospider and noquery on the same host.  
						
						 
						
						... 
						
						
						
						Fix punycoding of non-ascii domains. 
						
						
					 
					
						2015-09-13 17:15:31 -06:00  
					
					
						 
						
						
							
							
							 
							
							
							
							
							 
						
					 
				 
			
				
					
						
					 
					
						
						
							
						
						911b2837ca 
					 
					
						
						
							
							Merge branch 'testing' of  https://github.com/gigablast/open-source-search-engine  into testing  
						
						 
						
						... 
						
						
						
						Conflicts:
	Makefile
	Spider.cpp 
						
						
					 
					
						2015-09-12 15:51:59 -06:00  
					
					
						 
						
						
							
							
							 
							
							
							
							
							 
						
					 
				 
			
				
					
						
					 
					
						
						
							
						
						77bd8dcff9 
					 
					
						
						
							
							Start to detect non-ascii urls and encode them to ascii.  
						
						 
						
						... 
						
						
						
						(Work In Progress) 
						
						
					 
					
						2015-09-12 15:47:33 -06:00  
					
					
						 
						
						
							
							
							 
							
							
							
							
							 
						
					 
				 
			
				
					
						
					 
					
						
						
							
						
						5c89bde956 
					 
					
						
						
							
							now all container doc logic is in xmldoc  
						
						 
						
						... 
						
						
						
						and out of pageinject. compiles. needs testing. 
						
						
					 
					
						2015-05-01 20:32:54 -07:00  
					
					
						 
						
						
							
							
							 
							
							
							
							
							 
						
					 
				 
			
				
					
						
					 
					
						
						
							
						
						0ca27638bc 
					 
					
						
						
							
							checkpoint. moved warc and arc looping into xmldoc.  
						
						 
						
						... 
						
						
						
						now will pass any container doc from pageinject into
xmldoc. simplifies pageinject.cpp a lot. and sets up
a framework for dealing with container docs. 
						
						
					 
					
						2015-05-01 19:11:13 -07:00  
					
					
						 
						
						
							
							
							 
							
							
							
							
							 
						
					 
				 
			
				
					
						
					 
					
						
						
							
						
						184b157365 
					 
					
						
						
							
							Merge branch 'diffbot-testing' into ia  
						
						 
						
						
						
						
					 
					
						2015-04-29 21:43:00 -07:00  
					
					
						 
						
						
							
							
							 
							
							
							
							
							 
						
					 
				 
			
				
					
						
					 
					
						
						
							
						
						09a79d230c 
					 
					
						
						
							
							check for .css?* better as media extensions.  
						
						 
						
						... 
						
						
						
						do it when adding outlinks in xmldoc.cpp. 
						
						
					 
					
						2015-04-28 14:42:04 -07:00  
					
					
						 
						
						
							
							
							 
							
							
							
							
							 
						
					 
				 
			
				
					
						
					 
					
						
						
							
						
						0eb415d408 
					 
					
						
						
							
							added preliminary support for spidering .warc.gz and .arc.gz files  
						
						 
						
						
						
						
					 
					
						2015-04-27 21:41:22 -06:00  
					
					
						 
						
						
							
							
							 
							
							
							
							
							 
						
					 
				 
			
				
					
						
					 
					
						
						
							
						
						f4ca6d8cd4 
					 
					
						
						
							
							try domain only urls with www. when looking up  
						
						 
						
						... 
						
						
						
						in sitelinks.txt 
						
						
					 
					
						2015-01-31 15:33:37 -07:00  
					
					
						 
						
						
							
							
							 
							
							
							
							
							 
						
					 
				 
			
				
					
						
					 
					
						
						
							
						
						87285ba3cd 
					 
					
						
						
							
							use gbmemcpy not memcpy so we can get profiler working again  
						
						 
						
						... 
						
						
						
						since memcpy can't be interrupted and backtrace() called. 
						
						
					 
					
						2015-01-13 12:25:42 -07:00  
					
					
						 
						
						
							
							
							 
							
							
							
							
							 
						
					 
				 
			
				
					
						
					 
					
						
						
							
						
						a935e68484 
					 
					
						
						
							
							still do url extension check, but just remove  
						
						 
						
						... 
						
						
						
						certain ambiguous ones from the list to fix
Mr.T. 
						
						
					 
					
						2015-01-08 11:09:16 -08:00  
					
					
						 
						
						
							
							
							 
							
							
							
							
							 
						
					 
				 
			
				
					
						
					 
					
						
						
							
						
						931a1c4bc6 
					 
					
						
						
							
							good checkpoint. quite a few fixes.  
						
						 
						
						
						
						
					 
					
						2014-11-17 18:13:36 -08:00  
					
					
						 
						
						
							
							
							 
							
							
							
							
							 
						
					 
				 
			
				
					
						
					 
					
						
						
							
						
						96b8197ad3 
					 
					
						
						
							
							now it compiles with -m32  
						
						 
						
						
						
						
					 
					
						2014-11-10 14:45:11 -08:00  
					
					
						 
						
						
							
							
							 
							
							
							
							
							 
						
					 
				 
			
				
					
						
					 
					
						
						
							
						
						e7dd8f7956 
					 
					
						
						
							
							replace long long with int64_t  
						
						 
						
						
						
						
					 
					
						2014-10-30 13:36:39 -06:00  
					
					
						 
						
						
							
							
							 
							
							
							
							
							 
						
					 
				 
			
				
					
						
					 
					
						
						
							
						
						cc1ceaaac2 
					 
					
						
						
							
							fix nyt.com cookie redir bug.  
						
						 
						
						... 
						
						
						
						fixed bug when POSTing injection request with multipart/form-data. 
						
						
					 
					
						2014-08-05 17:04:11 -07:00  
					
					
						 
						
						
							
							
							 
							
							
							
							
							 
						
					 
				 
			
				
					
						
					 
					
						
						
							
						
						146e45db56 
					 
					
						
						
							
							try to fix some redirect issues  
						
						 
						
						
						
						
					 
					
						2014-07-31 10:34:03 -07:00  
					
					
						 
						
						
							
							
							 
							
							
							
							
							 
						
					 
				 
			
				
					
						
					 
					
						
						
							
						
						c3a823c99d 
					 
					
						
						
							
							fix relative url bug when relative url starts with ?  
						
						 
						
						
						
						
					 
					
						2014-06-03 10:54:50 -07:00  
					
					
						 
						
						
							
							
							 
							
							
							
							
							 
						
					 
				 
			
				
					
						
					 
					
						
						
							
						
						f6e560c1f4 
					 
					
						
						
							
							Initial file population.  
						
						 
						
						
						
						
					 
					
						2013-08-02 13:12:24 -07:00