import sqlite3
import urllib.error
import ssl
from urllib.parse import urljoin
from urllib.parse import urlparse
from urllib.request import urlopen
from bs4 import BeautifulSoup
# Ignore SSL certificate errors
ctx = ssl.create_default_context()
ctx.check_hostname = False
ctx.verify_mode = ssl.CERT_NONE
conn = sqlite3.connect('spider.sqlite')
cur = conn.cursor()
cur.execute('''CREATE TABLE IF NOT EXISTS Pages
(id INTEGER PRIMARY KEY, url TEXT UNIQUE, html TEXT,
error INTEGER, old_rank REAL, new_rank REAL)''')
cur.execute('''CREATE TABLE IF NOT EXISTS Links
(from_id INTEGER, to_id INTEGER, UNIQUE(from_id, to_id))''')
cur.execute('''CREATE TABLE IF NOT EXISTS Webs (url TEXT UNIQUE)''')
# Check to see if we are already in progress...
cur.execute('SELECT id,url FROM Pages WHERE html is NULL and error is NULL ORDER BY RANDOM() LIMIT 1')
row = cur.fetchone()
if row is not None:
print("Restarting existing crawl. Remove spider.sqlite to start a fresh crawl.")
else :
starturl = input('Enter web url or enter: ')
if ( len(starturl) < 1 ) : starturl = 'http://www.dr-chuck.com/'
if ( starturl.endswith('/') ) : starturl = starturl[:-1]
web = starturl
if ( starturl.endswith('.htm') or starturl.endswith('.html') ) :
pos = starturl.rfind('/')
web = starturl[:pos]
if ( len(web) > 1 ) :
cur.execute('INSERT OR IGNORE INTO Webs (url) VALUES ( ? )', ( web, ) )
cur.execute('INSERT OR IGNORE INTO Pages (url, html, new_rank) VALUES ( ?, NULL, 1.0 )', ( starturl, ) )
conn.commit()
# Get the current webs
cur.execute('''SELECT url FROM Webs''')
webs = list()
for row in cur:
webs.append(str(row[0]))
print(webs)
many = 0
while True:
if ( many < 1 ) :
sval = input('How many pages:')
if ( len(sval) < 1 ) : break
many = int(sval)
many = many - 1
cur.execute('SELECT id,url FROM Pages WHERE html is NULL and error is NULL ORDER BY RANDOM() LIMIT 1')
try:
row = cur.fetchone()
# print row
fromid = row[0]
url = row[1]
except:
print('No unretrieved HTML pages found')
many = 0
break
print(fromid, url, end=' ')
# If we are retrieving this page, there should be no links from it
cur.execute('DELETE from Links WHERE from_id=?', (fromid, ) )
try:
document = urlopen(url, context=ctx)
html = document.read()
if document.getcode() != 200 :
print("Error on page: ",document.getcode())
cur.execute('UPDATE Pages SET error=? WHERE url=?', (document.getcode(), url) )
if 'text/html' != document.info().get_content_type() :
print("Ignore non text/html page")
cur.execute('DELETE FROM Pages WHERE url=?', ( url, ) )
conn.commit()
continue
print('('+str(len(html))+')', end=' ')
soup = BeautifulSoup(html, "html.parser")
except KeyboardInterrupt:
print('')
print('Program interrupted by user...')
break
except:
print("Unable to retrieve or parse page")
cur.execute('UPDATE Pages SET error=-1 WHERE url=?', (url, ) )
conn.commit()
continue
cur.execute('INSERT OR IGNORE INTO Pages (url, html, new_rank) VALUES ( ?, NULL, 1.0 )', ( url, ) )
cur.execute('UPDATE Pages SET html=? WHERE url=?', (memoryview(html), url ) )
conn.commit()
# Retrieve all of the anchor tags
tags = soup('a')
count = 0
for tag in tags:
href = tag.get('href', None)
if ( href is None ) : continue
# Resolve relative references like href="/contact"
up = urlparse(href)
if ( len(up.scheme) < 1 ) :
href = urljoin(url, href)
ipos = href.find('#')
if ( ipos > 1 ) : href = href[:ipos]
if ( href.endswith('.png') or href.endswith('.jpg') or href.endswith('.gif') ) : continue
if ( href.endswith('/') ) : href = href[:-1]
# print href
if ( len(href) < 1 ) : continue
# Check if the URL is in any of the webs
found = False
for web in webs:
if ( href.startswith(web) ) :
found = True
break
if not found : continue
cur.execute('INSERT OR IGNORE INTO Pages (url, html, new_rank) VALUES ( ?, NULL, 1.0 )', ( href, ) )
count = count + 1
conn.commit()
cur.execute('SELECT id FROM Pages WHERE url=? LIMIT 1', ( href, ))
try:
row = cur.fetchone()
toid = row[0]
except:
print('Could not retrieve id')
continue
# print fromid, toid
cur.execute('INSERT OR IGNORE INTO Links (from_id, to_id) VALUES ( ?, ? )', ( fromid, toid ) )
print(count)
cur.close()
I laughed insolently. The young ladies sparkled, and so did Miss Harper, as she asked him who had been the proxy. He might have known by looking a trifle more narrowly; I saw plainly, thrillingly, who he was; but his attention was diverted by some signal from the men he had sent to the fringe of cane; they had found the tracks of horses leading through the canes into the corn. But now he hailed me again. "Here, you! what are you doing at that fence? Who are you?" My companion lifted higher in the saddle with delight. Then soberly he said, "We have got to lose her." I turned inquiringly and he continued: "She has done me the honor to tell me--Miss Harper and me--that if she succeeds in what she is now trying to do--you know?--" [Pg 26] "Please be good enough to have a look at my papers, and then...." The Rajah, a prisoner in his little state, a ruler only in name and deposed from his power, as I rose to take my leave, cast a glance of deep melancholy towards a last golden beam that quivered on the sacred hill, and seemed to awake from a dream. ¡°Somebody¡¯s overboard!¡± 225 "That's because we've taken 'em and have our own men there," replied Gid Mackall. "It'll all be different when we git ashore and further into the State." It might not be advisable to begin such a fight. Even with modern methods of transport and training, the weapons gap between the Confederation and Fruyling's World is a severe handicap. In other words, J. O., if it came to a showdown the people here don't think we stand a fair chance of coming out on top. "And d?an't pretend you can't understand naun but picturs. A good solid turnup in real life is worth a dozen pretty gals in picturs." "Oh, thank you," said Reuben, bowing in mock politeness, and trying to copy his clipped English. Caro looked gloomily at the water. She did not like being told she would be shocked, though she knew she would be. "Git off¡ªbefore I t?ake my gun and shoot you." "We won't tell him that, though, Davy¡ªwe'll make out as it's pure patriotic feeling on our part." She shuddered. HoMEãñÔóÂÜÀÒ»Ò¹³õÌåÑémagnet
ENTER NUMBET 0016jncfsbcc.com.cn
hbyttsc.com.cn
vu7.com.cn
wbit.org.cn
www.txchain.com.cn
psafca.com.cn
rbchain.com.cn
pttjui.com.cn
www.thrjn.com.cn
pinlaser.com.cn