I give up for now, might come back to this later
This commit is contained in:
parent
713d7e8d1d
commit
aa84238845
|
@ -1,2 +1,3 @@
|
|||
# python.htformtool
|
||||
# HTFormTool
|
||||
|
||||
An attempt at passing off a very basic CLI tool as some sort of web browser.
|
||||
|
|
|
@ -0,0 +1,493 @@
|
|||
#!/usr/bin/env python3
|
||||
|
||||
import requests
|
||||
from bs4 import BeautifulSoup
|
||||
import click
|
||||
|
||||
from enum import Enum
|
||||
|
||||
# Tool version; shown by --version and embedded in the default User-Agent.
VERSION = '0.1.0'

# Session used by the htformtool command for its HTTP requests.
sess = requests.Session()

# Headers copied onto the session before the document is fetched.
base_headers = {
    # request (x)html form
    'Accept': 'text/html,application/xhtml+xml',
    'User-Agent': 'htformtool/' + VERSION,
}

# Headers for form submission; not referenced by the code shown here yet
# (presumably meant to be merged into POST requests -- TODO confirm).
post_headers = {
    # request confirmation code
    'Accept': 'text/plain',
}
|
||||
|
||||
def hide_ua(ctx, param, value):
    """Click callback for ``--hide-ua``: blank out the User-Agent header.

    Does nothing when the flag was not given or when click is only doing
    resilient parsing. Otherwise the ``User-Agent`` entry of the module-level
    ``base_headers`` is set to ``None`` (presumably so that requests omits
    the header entirely -- confirm against requests' header merging).
    """
    flag_active = bool(value) and not ctx.resilient_parsing
    if flag_active:
        base_headers['User-Agent'] = None
|
||||
|
||||
def split_on_ascii_whitespace(inp):
    """Split *inp* on runs of ASCII whitespace and return the tokens.

    Only TAB, LF, FF, CR and SPACE act as separators, so the result can
    differ from ``str.split()``, which also splits on other whitespace
    characters. Leading and trailing separators never produce empty tokens.
    """
    separators = '\x09\x0A\x0C\x0D\x20'
    length = len(inp)
    tokens = []
    pos = 0
    while True:
        # Skip the separator run in front of the next token.
        while pos < length and inp[pos] in separators:
            pos += 1
        if pos >= length:
            return tokens
        # Collect everything up to the next separator (or the end).
        token_start = pos
        while pos < length and inp[pos] not in separators:
            pos += 1
        tokens.append(inp[token_start:pos])
|
||||
|
||||
def ascii_lowercase(s):
    """Return *s* with the ASCII letters A-Z lowercased.

    Unlike ``str.lower()``, every non-ASCII character is left untouched.
    """
    import string
    upper_to_lower = dict(zip(map(ord, string.ascii_uppercase),
                              map(ord, string.ascii_lowercase)))
    return s.translate(upper_to_lower)
|
||||
|
||||
def get_encoding(label):
    """Return the codec for the encoding named by *label*.

    Only UTF-8 is supported so far. The label is matched the way the
    WHATWG Encoding Standard prescribes: leading and trailing ASCII
    whitespace is ignored and the comparison is ASCII case-insensitive.

    Returns the ``codecs.CodecInfo`` for UTF-8.

    Raises NotImplementedError if *label* does not name UTF-8.
    """
    import codecs
    # All labels the Encoding Standard assigns to UTF-8.
    utf8_labels = (
        'unicode-1-1-utf-8',
        'unicode11utf8',
        'unicode20utf8',
        'utf-8',
        'utf8',
        'x-unicode20utf8',
    )
    normalized = ascii_lowercase(label.strip('\x09\x0A\x0C\x0D\x20'))
    if normalized not in utf8_labels:
        # TODO: support the remaining encodings of the Encoding Standard.
        raise NotImplementedError(label)
    return codecs.lookup('utf-8')
|
||||
|
||||
import re
|
||||
# Matches a lone CR (not followed by LF) or a lone LF (not preceded by CR).
newline_normalize = re.compile('\x0D(?!\x0A)|(?<!\x0D)\x0A')


def append_an_entry(l, name, value, no_line_break_normalization=False):
    """Append the entry ``(name, value)`` to the entry list *l*.

    Lone CR and lone LF characters in *name* are replaced by CRLF. The same
    is done to *value* unless *no_line_break_normalization* is true or
    *value* is not a string; non-string values are stored unchanged.
    """
    # TODO might not be *strictly* correct
    name = newline_normalize.sub('\r\n', name)
    # Only strings can be normalized; re.sub() raises TypeError on anything
    # else, so such values are passed through as they are.
    if isinstance(value, str) and not no_line_break_normalization:
        value = newline_normalize.sub('\r\n', value)
    l.append((name, value))
|
||||
|
||||
class FieldState(Enum):
    """The kind of form control a field represents."""

    # normal <input> types
    HIDDEN = 'hidden'
    TEXT = 'text'
    SEARCH = 'search'
    TELEPHONE = 'tel'
    URL = 'url'
    EMAIL = 'email'
    PASSWORD = 'password'
    DATE = 'date'
    MONTH = 'month'
    WEEK = 'week'
    TIME = 'time'
    LOCAL_DATE_AND_TIME = 'datetime-local'
    NUMBER = 'number'
    RANGE = 'range'
    COLOR = 'color'
    CHECKBOX = 'checkbox'
    RADIO = 'radio'
    FILE = 'file'
    SUBMIT = 'submit'
    IMAGE = 'image'
    RESET = 'reset'
    BUTTON = 'button'

    # custom, htformtool-specific <input> types
    CREDENTIALS = 'credentials'

    # non-<input> types
    TEXTAREA = 'textarea'
    SELECT = 'select'

    # <button> types
    BSUBMIT = 'bsubmit'
    BRESET = 'breset'
    BBUTTON = 'bbutton'

    def is_button(self, submitter=None):
        """Tell whether this state is a button.

        With ``submitter=None`` any button qualifies. With ``submitter=True``
        only buttons that submit the form do, and with ``submitter=False``
        only the ones that don't.
        """
        kind = type(self)
        if self in (kind.BSUBMIT, kind.IMAGE, kind.SUBMIT):
            submits = True
        elif self in (kind.BRESET, kind.BBUTTON, kind.RESET, kind.BUTTON):
            submits = False
        else:
            return False
        return submitter is None or submitter == submits

    def blocks_implicit_submission(self):
        """Tell whether a field in this state blocks implicit submission."""
        kind = type(self)
        blocking = frozenset((
            kind.TEXT, kind.SEARCH, kind.URL, kind.TELEPHONE,
            kind.EMAIL, kind.PASSWORD, kind.DATE, kind.MONTH,
            kind.WEEK, kind.TIME, kind.LOCAL_DATE_AND_TIME, kind.NUMBER,
        ))
        return self in blocking
|
||||
|
||||
class ConstraintError(ValueError):
    """Raised when a field's value does not satisfy its constraints."""
|
||||
|
||||
class FormData:
    """
    Represents the data to be submitted by the form.

    Attributes:
        encoding: The codec object that should be used to encode the form
            for sending.
        entry_list: The entry list.
        action: The form's raw action (URL) (not parsed).
        enctype: The form's enctype.
        method: The form's method (not sanitized).
        target: The form's target (not sanitized).
    """

    def __init__(self, encoding, entry_list, action, enctype, method, target):
        self.encoding, self.entry_list = encoding, entry_list
        self.action, self.enctype = action, enctype
        self.method, self.target = method, target
|
||||
|
||||
class Form:
    """A ``<form>`` element together with the controls that belong to it.

    Attributes:
        form: The parsed ``<form>`` tag.
        elements: The raw control tags owned by the form.
        fields: The ``FormField`` wrappers of those controls.
    """

    def __init__(self, form):
        self.form = form
        # these have the same length
        self.elements = []
        self.fields = []

    def submit(self, document_encoding, submitter=None):
        """
        Submits the form implicitly, or with the given submitter.

        Without a submitter, the first submit button of the form (if any) is
        used as the submitter. If there is none, implicit submission is only
        allowed when at most one field blocks implicit submission.

        Raises ValueError if the given submitter isn't a valid submitter.

        Raises ConstraintError if the no-validate state is false and one or more of the form's fields is invalid. (note: ConstraintError is a subtype of ValueError)

        Returns a FormData object, or None if implicit submission is not allowed.
        """
        if submitter is not None:
            if submitter not in self.fields:
                raise ValueError
            if not submitter.is_button(submitter=True):
                raise ValueError
        else:
            blocking_fields = 0
            for field in self.fields:
                if field.is_button(submitter=True):
                    # The first submit button acts as the default button.
                    submitter = field
                    break
                if field.blocks_implicit_submission():
                    blocking_fields += 1
            if submitter is None and blocking_fields > 1:
                return None

        # A boolean attribute is "set" as soon as it is present, even when
        # its value is the empty string, hence the comparison against None.
        if submitter is not None:
            skip_validation = submitter.no_validate()
        else:
            skip_validation = self.form.get('novalidate') is not None
        if not skip_validation:
            for field in self.fields:
                field.check_value()

        encoding = self._choose_encoding(document_encoding)
        entry_list = self._build_entry_list(submitter)

        action = self._submission_attribute(submitter, 'formaction', 'action')
        if action is None:
            action = ''

        enctype = self._submission_attribute(submitter, 'formenctype', 'enctype')
        if enctype is not None:
            enctype = ascii_lowercase(enctype)
        if enctype not in ('application/x-www-form-urlencoded', 'multipart/form-data', 'text/plain'):
            enctype = 'application/x-www-form-urlencoded'

        method = self._submission_attribute(submitter, 'formmethod', 'method')
        if method is not None:
            method = ascii_lowercase(method)
        if method not in ('get', 'post', 'dialog'):
            method = 'get'

        # WARNING: NOT SANITIZED
        target = self._resolve_target(submitter)

        return FormData(encoding, entry_list, action, enctype, method, target)

    def _choose_encoding(self, document_encoding):
        """Pick the submission encoding from accept-charset, else the document's."""
        accept_charset = self.form.get('accept-charset')
        if accept_charset is None:
            return document_encoding
        if not isinstance(accept_charset, str):
            # BeautifulSoup's HTML builders hand out accept-charset as a
            # list of tokens (it is one of their multi-valued attributes).
            accept_charset = ' '.join(accept_charset)
        for label in split_on_ascii_whitespace(accept_charset):
            try:
                candidate = get_encoding(label)
            except NotImplementedError:
                # Unknown or unsupported label: try the next one.
                continue
            if candidate is not None:
                return candidate
        return get_encoding('utf-8')

    def _build_entry_list(self, submitter):
        """Build the list of (name, value) entries to be submitted."""
        entry_list = []
        for field in self.fields:
            if field.is_button() and field is not submitter:
                continue
            if field.is_checkable() and not field.is_checked():
                continue
            if field.is_image_button():
                prefix = field.field['name'] + '.' if field.field.get('name') else ''
                # There is no pointer in a CLI, so the click position is 0,0.
                append_an_entry(entry_list, prefix + 'x', '0')
                append_an_entry(entry_list, prefix + 'y', '0')
                continue
            name = field.field['name']
            if field.is_select():
                for option in field.get_options():
                    # TODO: append an entry per selected, enabled option
                    raise NotImplementedError
            elif field.is_checkable():
                append_an_entry(entry_list, name, field.get_value())
            elif field.is_file():
                raise NotImplementedError
            elif field.is_hidden() and name == '_charset_':
                raise NotImplementedError
            elif field.is_textarea():
                raise NotImplementedError
            else:
                append_an_entry(entry_list, name, field.get_value())
                if field.has_valid_dirname():
                    # TODO: append the dirname entry
                    raise NotImplementedError
        return entry_list

    def _submission_attribute(self, submitter, override, attribute):
        """Return the submitter's *override* attribute if it has one, else the
        form's non-empty *attribute*, else None."""
        if submitter is not None and submitter.field.get(override) is not None:
            return submitter.field[override]
        return self.form.get(attribute) or None

    def _resolve_target(self, submitter):
        """Return the raw target: submitter's formtarget, form's target,
        the document's <base> target, or the empty string."""
        if submitter is not None and submitter.field.get('formtarget') is not None:
            return submitter.field['formtarget']
        if self.form.get('target') is not None:
            return self.form['target']
        document = self.form.find_parent('[document]')
        base = document.base if document is not None else None
        if base is not None and base.get('target') is not None:
            return base['target']
        return ''
|
||||
|
||||
class FormField:
    """A single form control (input, button, textarea or select).

    Attributes:
        field: The parsed tag of the control.
        form: The ``Form`` that owns the control.
        state: The ``FieldState`` derived from the tag name and type.
    """

    def __init__(self, form, field):
        """Wrap *field*, register it in ``form.fields`` and reset it.

        Raises ValueError if *field* is not an input, button, textarea or
        select tag.
        """
        self.field = field
        self.form = form
        # Determine the state first so that an unsupported tag is rejected
        # before the field is registered with the form.
        self.state = self._initial_state(field)
        form.fields.append(self)

        self.reset()

    @staticmethod
    def _initial_state(field):
        """Map a control tag to its FieldState."""
        if field.name == 'input':
            ftype = ascii_lowercase(field.get('type', 'text'))
            # These enum values are reserved for non-<input> controls, so an
            # <input> using them as type is just an unknown type, i.e. text.
            if ftype in ('textarea', 'select', 'bbutton', 'bsubmit', 'breset'):
                return FieldState.TEXT
            try:
                return FieldState(ftype)
            except ValueError:
                return FieldState.TEXT
        if field.name == 'button':
            ftype = ascii_lowercase(field.get('type', 'submit'))
            if ftype == 'reset':
                return FieldState.BRESET
            if ftype == 'button':
                return FieldState.BBUTTON
            return FieldState.BSUBMIT
        if field.name == 'textarea':
            return FieldState.TEXTAREA
        if field.name == 'select':
            return FieldState.SELECT
        raise ValueError('not a form control: {!r}'.format(field.name))

    def reset(self):
        """Reset the field to the state given by its markup."""
        if self.is_checkable():
            self._checked = self.field.get('checked') is not None
        if self.is_select():
            self.options = []
            raise NotImplementedError
        if self._is_text_based():
            self.text = self.field.get('value') or ''

    def _is_text_based(self):
        raise NotImplementedError

    def is_button(self, submitter=None):
        """Tell whether this field is a button; see FieldState.is_button."""
        return self.state.is_button(submitter=submitter)

    def blocks_implicit_submission(self):
        """Tell whether this field blocks implicit submission."""
        return self.state.blocks_implicit_submission()

    def is_textarea(self):
        return self.state == FieldState.TEXTAREA

    def is_image_button(self):
        return self.state == FieldState.IMAGE

    def is_radio(self, name=None):
        """Tell whether this is a radio button (of the group *name*, if given)."""
        return self.state == FieldState.RADIO and (name is None or self.field.get('name') == name)

    def is_file(self):
        return self.state == FieldState.FILE

    def is_checkable(self):
        return self.state in (FieldState.CHECKBOX, FieldState.RADIO)

    def is_checked(self):
        """Return the checkedness. Raises ValueError if not checkable."""
        if not self.is_checkable():
            raise ValueError
        return self._checked

    def is_select(self):
        return self.field.name == 'select'

    def has_valid_dirname(self):
        # TODO: true for text controls with a non-empty dirname attribute
        raise NotImplementedError

    def get_options(self, selected=None, disabled=False):
        """Not implemented yet. Raises ValueError if this is not a select."""
        if not self.is_select():
            raise ValueError
        raise NotImplementedError

    def set_checked(self, checkedness=True):
        """Set the checkedness of a checkbox or radio button.

        Checking a radio button unchecks the other checked radio buttons of
        the same name in the form.

        Raises ValueError if the field is not checkable, or if asked to
        uncheck a radio button.
        """
        if not self.is_checkable():
            raise ValueError
        if self.is_radio():
            if not checkedness:
                raise ValueError  # you can't really un-check a radio
            for field in self.form.fields:
                if field.is_radio(name=self.field['name']) and field.is_checked():
                    field._checked = False
        self._checked = checkedness

    def set_value(self, value):
        raise NotImplementedError

    def get_value(self):
        """Return the value to submit; only implemented for checkable fields."""
        if self.is_checkable():
            return self.field.get('value', 'on')
        raise NotImplementedError

    def check_value(self):
        """
        Checks if the value in this field satisfies the constraints.

        Raises ConstraintError if it doesn't.
        """
        raise NotImplementedError

    def no_validate(self):
        """
        Returns this element's no-validate state.

        Raises ValueError if this element is not a submitter button.
        """
        if not self.is_button(submitter=True):
            raise ValueError
        if self.field.get('formnovalidate') is not None:
            return True
        if self.form.form.get('novalidate') is not None:
            return True
        return False

    def submit(self, document_encoding):
        """
        Submits the form with this field as submitter.

        Raises ValueError if this field isn't a valid submitter.

        Raises ConstraintError if this field's no-validate state is false and one or more of the form's fields is invalid. (note: ConstraintError is a subtype of ValueError)

        Returns whatever the owning form's submit() returns for this submitter.
        """
        return self.form.submit(document_encoding=document_encoding, submitter=self)
|
||||
|
||||
def extract_forms(document):
    """Collect the forms of *document* together with their controls.

    Every button, input, select and textarea is assigned to a form: the
    form whose id its ``form`` attribute names, or, without that attribute,
    its nearest ancestor form. A ``form`` attribute naming no known form
    leaves the control without an owner. Controls that are disabled, lack a
    name (image inputs excepted) or are named ``isindex`` are dropped.

    Returns the list of ``Form`` objects, in document order, with their
    ``elements`` and ``fields`` populated.
    """
    forms = [Form(form) for form in document.find_all('form')]
    # With duplicate ids the first form in document order wins.
    forms_with_id = {}
    for form in forms:
        form_id = form.form.get('id')
        if form_id is not None:
            forms_with_id.setdefault(form_id, form)
    # Keyed by identity: tags compare (and hash) by content, so two forms
    # with identical markup would otherwise collapse into one entry.
    forms_by_object = {id(form.form): form for form in forms}

    for element in document.find_all(['button', 'input', 'select', 'textarea']):
        owner_id = element.get('form')
        if owner_id is not None:
            owner = forms_with_id.get(owner_id)
        else:
            parent_form = element.find_parent('form')
            owner = forms_by_object[id(parent_form)] if parent_form is not None else None
        if owner is not None:
            owner.elements.append(element)
    # we don't need these anymore
    del forms_with_id
    del forms_by_object

    # disabled controls
    def should_keep(field):
        def is_disabled(field):
            if field.get('disabled') is not None:
                return True
            for fieldset in field.find_parents('fieldset'):
                if fieldset.get('disabled') is None:
                    continue
                legend = fieldset.find('legend', recursive=False)
                if legend is not None and any(parent is legend for parent in field.parents):
                    # Inside this fieldset's legend: not disabled by this
                    # fieldset, but an outer fieldset may still disable it.
                    continue
                return True
            return False

        if is_disabled(field):
            return False
        is_image_input = (field.name == 'input'
                          and ascii_lowercase(field.get('type') or '') == 'image')
        if not is_image_input and not field.get('name'):
            return False
        # weird edge-case
        if field.get('name') == 'isindex':
            return False
        return True

    # clean up
    for form in forms:
        # TODO clean this up
        # FormField() registers itself in form.fields as a side effect.
        newelements = [FormField(form, field) for field in form.elements if should_keep(field)]
        form.elements = [ff.field for ff in newelements]

    return forms
|
||||
|
||||
@click.command()
@click.option('--encoding', default='', help='Overrides the character encoding of the document.', metavar='ENCODING')
@click.option('--fallback-encoding', default='utf-8', help='Sets the encoding used if the encoding can\'t be determined by another means. Ignored if --encoding is used.', show_default=True, metavar='ENCODING')
@click.option('--hide-ua', is_flag=True, help='Prevent sending the User-Agent string.', expose_value=False, is_eager=True, callback=hide_ua)
@click.version_option(version=VERSION, prog_name="htformtool")
@click.argument('url')
def htformtool(encoding, fallback_encoding, url):
    """Fetch URL and list the HTML forms found in the document."""
    sess.headers.update(base_headers)
    sess.headers.update({'Accept-Charset': encoding or fallback_encoding})
    r = sess.get(url)
    try:
        if not click.confirm("Using {}, continue?".format(r.url), default=True):
            return
    except click.Abort:
        click.echo("")  # just a newline
        return

    content_type = r.headers.get('Content-Type')
    if content_type is None:
        # no content-type, just assume HTML. it's probably good enough for our needs.
        isXML = None
    else:
        # Media type names are case-insensitive.
        media_type = content_type.strip().lower()
        if media_type.startswith('text/html'):
            isXML = False
        elif media_type.startswith('application/xhtml+xml'):
            isXML = True
        else:
            # not an acceptable content type, ignore it.
            click.echo("No HTML found.")
            return

    # NOTE: r.encoding only matters for r.text; the parser below is fed the
    # raw bytes, so the override is passed to it explicitly.
    if encoding:
        r.encoding = encoding
    if r.encoding is None:
        # FIXME the fallback encoding is not handed to the parser yet
        r.encoding = fallback_encoding
    soup = BeautifulSoup(r.content, "xml" if isXML else "html5lib",
                         from_encoding=encoding or None)
    forms = extract_forms(soup)
    if not forms:
        click.echo("No forms found.")
    else:
        click.echo("There are {} forms:".format(len(forms)))
        # TODO
        click.echo(forms)


if __name__ == '__main__':
    htformtool()
|
|
@ -0,0 +1,5 @@
|
|||
beautifulsoup4==4.7.1
|
||||
Click==7.0
|
||||
html5lib==1.0.1
|
||||
requests==2.21.0
|
||||
lxml==4.3.0
|
Loading…
Reference in New Issue