// Matt Wells, copyright Nov 2002

#ifndef GB_SPIDERLOOP_H
#define GB_SPIDERLOOP_H

#include "RdbList.h"
#include "HashTableX.h"
#include "Msg5.h"
#include "hash.h"
#include "RdbCache.h"
#include "FxCache.h"
#include <time.h>
#include <atomic>

// . the spider loop
// . it gets urls to spider from the SpiderCache global class, g_spiderCache
// . supports robots.txt
// . supports <META NAME="ROBOTS" CONTENT="NOINDEX">  (no indexing)
// . supports <META NAME="ROBOTS" CONTENT="NOFOLLOW"> (no links)
// . supports limiting spiders per domain

// . max spiders we can have going at once for this process
// . the cap exists to prevent OOM conditions
// . NOTE(review): this comment used to say "limit to 50", which does not
//   match the value defined below -- confirm which one is intended
// . also used as the size of the SpiderLoop::m_docs array
#define MAX_SPIDERS 300


// . forward declarations only: the types used below appear solely as
//   pointers, so their full definitions are not needed in this header
// . NOTE(review): UdpSlot is not referenced anywhere else in this header as
//   shown -- it may be kept for files that include this one; confirm before
//   removing
class UdpSlot;
class SpiderRequest;
class SpiderColl;
class CollectionRec;
class XmlDoc;
class UrlLock;

// . the spider loop object for this process (see the global g_spiderLoop)
// . holds the XmlDoc slots used for spidering/parsing/indexing urls, the
//   url lock api, and the state used to fetch the next url(s) to spider
class SpiderLoop {
public:
	~SpiderLoop();
	SpiderLoop();

	// . not copyable
	// . the std::atomic member already makes the implicit copy operations
	//   deleted, so spelling it out changes nothing for existing callers
	// . it is stated explicitly because this class has a destructor and
	//   holds raw XmlDoc pointers, so a memberwise copy must never become
	//   possible by accident if the members change later
	SpiderLoop(const SpiderLoop &) = delete;
	SpiderLoop &operator=(const SpiderLoop &) = delete;

	// free all XmlDocs and m_list
	void reset();

	// . call this no matter what
	// . if spidering is disabled this will sleep about 10 seconds or so
	//   before checking to see if it's been enabled
	void init();

	// NOTE(review): not documented in this header; presumably (re)loads
	// the settings the loop depends on -- confirm against the definition
	void initSettings();

	// . presumably the number of spiders currently out for "firstIp"
	//   within collection "collnum" -- TODO confirm against the definition
	int32_t getNumSpidersOutPerIp ( int32_t firstIp , collnum_t collnum ) ;

	// returns the current value of the atomic counter m_numSpidersOut
	int32_t getNumSpidersOut() const { return m_numSpidersOut; }

	// . url lock api, keyed by a 64-bit lock key
	// . presumably backed by m_lockTable and serialized by m_lockTableMtx
	//   (the definitions are not in this header) -- TODO confirm
	bool isLocked(int64_t key) const;
	int32_t getLockCount() const;
	bool addLock(int64_t key, const UrlLock *lock);
	void removeLock(int64_t key);
	// takes a collection number rather than a single lock key
	void clearLocks(collnum_t collnum);

	// . for spidering/parsing/indexing a url(s)
	// . one slot per possible concurrent spider (MAX_SPIDERS of them)
	XmlDoc *m_docs [ MAX_SPIDERS ];

	RdbCache   m_winnerListCache;

	// . marks the active list as no longer valid
	// . presumably makes getActiveList() rebuild it on its next call --
	//   TODO confirm against the definition
	void invalidateActiveList() { m_activeListValid = false; }

	// NOTE(review): by its name this drops the m_winnerListCache entries
	// of collection "collnum" -- confirm against the definition
	void nukeWinnerListCache(collnum_t collnum);

private:
	// . static wrappers with callback-style signatures taking a "state"
	// . presumably "state" carries the object the callback should act on
	//   -- TODO confirm against the definitions
	static void indexedDocWrapper ( void *state ) ;
	static void doneSleepingWrapperSL ( int fd , void *state ) ;
	static void gotDoledbListWrapper2 ( void *state , RdbList *list , Msg5 *msg5 ) ;

	void spiderDoledUrls ( ) ;
	bool gotDoledbList2  ( ) ;

	// . returns false if blocked and "callback" will be called,
	//   true otherwise
	// . returns true and sets g_errno on error
	bool spiderUrl(SpiderRequest *sreq, const key96_t *doledbKey, collnum_t collnum);
	bool spiderUrl2(SpiderRequest *sreq, const key96_t *doledbKey, collnum_t collnum);

	bool indexedDoc ( XmlDoc *doc );

	CollectionRec *getActiveList();
	void buildActiveList ( ) ;

	// . the value reported by getNumSpidersOut()
	// . atomic, so that read does not need a lock
	std::atomic<int32_t> m_numSpidersOut;

	// . the previous comment described this as the index "i" of the
	//   highest in-use slot of an "m_msg14" array, used to limit scanning
	//   to the first "i" slots
	// . there is no m_msg14 member in this class; presumably the array
	//   meant today is m_docs -- TODO confirm and update
	int32_t m_maxUsed;

	// NOTE(review): presumably a running count of spider launches --
	// confirm against the .cpp
	int32_t m_launches;

	// . m_lockTableMtx is mutable so the const lock queries above can
	//   still acquire it
	// . presumably it guards m_lockTable -- TODO confirm
	HashTableX m_lockTable;
	mutable GbMutex m_lockTableMtx;

	// NOTE(review): string-keyed cache of opaque pointers; what it stores
	// is not visible from this header
	FxCache<std::string, void*> m_urlCache;

	// . list for getting next url(s) to spider
	RdbList m_list;

	// for getting RdbLists
	Msg5 m_msg5;

	SpiderColl *m_sc;

	// presumably true while a doledb list read is outstanding -- confirm
	bool m_gettingDoledbList;

	CollectionRec *m_crx;
	CollectionRec *m_activeList;
	CollectionRec *m_bookmark;

	// cleared by invalidateActiveList()
	bool m_activeListValid;
	int32_t m_activeListCount;

	uint32_t m_recalcTime;
	bool m_recalcTimeValid;

	int64_t m_doleStart;
};

// the global SpiderLoop instance; only declared here, defined elsewhere
extern SpiderLoop g_spiderLoop;

#endif // GB_SPIDERLOOP_H