使用 FigureWidget 与 ipywidgets 进行交互式数据分析

import datetime
import numpy as np
import pandas as pd

import plotly.graph_objects as go
from ipywidgets import widgets

NYC Flights Database

我们将构建一个应用程序,用来查看 2013 年所有从纽约出发的航班的延误情况。

# Download the NYC flights CSV over HTTP, then discard its first column.
df = pd.read_csv(
    'https://media.githubusercontent.com/media/xinet-collections/test-dastsets/main/nycflights.csv'
)
df = df.drop(columns=df.columns[[0]])
---------------------------------------------------------------------------
HTTPError                                 Traceback (most recent call last)
<ipython-input-2-0d6ad44553ee> in <module>
----> 1 df = pd.read_csv(
      2     'https://media.githubusercontent.com/media/xinet-collections/test-dastsets/main/nycflights.csv')
      3 df = df.drop(df.columns[[0]], axis=1)

/usr/share/miniconda/lib/python3.8/site-packages/pandas/util/_decorators.py in wrapper(*args, **kwargs)
    309                     stacklevel=stacklevel,
    310                 )
--> 311             return func(*args, **kwargs)
    312 
    313         return wrapper

/usr/share/miniconda/lib/python3.8/site-packages/pandas/io/parsers/readers.py in read_csv(filepath_or_buffer, sep, delimiter, header, names, index_col, usecols, squeeze, prefix, mangle_dupe_cols, dtype, engine, converters, true_values, false_values, skipinitialspace, skiprows, skipfooter, nrows, na_values, keep_default_na, na_filter, verbose, skip_blank_lines, parse_dates, infer_datetime_format, keep_date_col, date_parser, dayfirst, cache_dates, iterator, chunksize, compression, thousands, decimal, lineterminator, quotechar, quoting, doublequote, escapechar, comment, encoding, encoding_errors, dialect, error_bad_lines, warn_bad_lines, on_bad_lines, delim_whitespace, low_memory, memory_map, float_precision, storage_options)
    584     kwds.update(kwds_defaults)
    585 
--> 586     return _read(filepath_or_buffer, kwds)
    587 
    588 

/usr/share/miniconda/lib/python3.8/site-packages/pandas/io/parsers/readers.py in _read(filepath_or_buffer, kwds)
    480 
    481     # Create the parser.
--> 482     parser = TextFileReader(filepath_or_buffer, **kwds)
    483 
    484     if chunksize or iterator:

/usr/share/miniconda/lib/python3.8/site-packages/pandas/io/parsers/readers.py in __init__(self, f, engine, **kwds)
    809             self.options["has_index_names"] = kwds["has_index_names"]
    810 
--> 811         self._engine = self._make_engine(self.engine)
    812 
    813     def close(self):

/usr/share/miniconda/lib/python3.8/site-packages/pandas/io/parsers/readers.py in _make_engine(self, engine)
   1038             )
   1039         # error: Too many arguments for "ParserBase"
-> 1040         return mapping[engine](self.f, **self.options)  # type: ignore[call-arg]
   1041 
   1042     def _failover_to_python(self):

/usr/share/miniconda/lib/python3.8/site-packages/pandas/io/parsers/c_parser_wrapper.py in __init__(self, src, **kwds)
     49 
     50         # open handles
---> 51         self._open_handles(src, kwds)
     52         assert self.handles is not None
     53 

/usr/share/miniconda/lib/python3.8/site-packages/pandas/io/parsers/base_parser.py in _open_handles(self, src, kwds)
    220         Let the readers open IOHandles after they are done with their potential raises.
    221         """
--> 222         self.handles = get_handle(
    223             src,
    224             "r",

/usr/share/miniconda/lib/python3.8/site-packages/pandas/io/common.py in get_handle(path_or_buf, mode, encoding, compression, memory_map, is_text, errors, storage_options)
    606 
    607     # open URLs
--> 608     ioargs = _get_filepath_or_buffer(
    609         path_or_buf,
    610         encoding=encoding,

/usr/share/miniconda/lib/python3.8/site-packages/pandas/io/common.py in _get_filepath_or_buffer(filepath_or_buffer, encoding, compression, mode, storage_options)
    309         # assuming storage_options is to be interpreted as headers
    310         req_info = urllib.request.Request(filepath_or_buffer, headers=storage_options)
--> 311         with urlopen(req_info) as req:
    312             content_encoding = req.headers.get("Content-Encoding", None)
    313             if content_encoding == "gzip":

/usr/share/miniconda/lib/python3.8/site-packages/pandas/io/common.py in urlopen(*args, **kwargs)
    209     import urllib.request
    210 
--> 211     return urllib.request.urlopen(*args, **kwargs)
    212 
    213 

/usr/share/miniconda/lib/python3.8/urllib/request.py in urlopen(url, data, timeout, cafile, capath, cadefault, context)
    220     else:
    221         opener = _opener
--> 222     return opener.open(url, data, timeout)
    223 
    224 def install_opener(opener):

/usr/share/miniconda/lib/python3.8/urllib/request.py in open(self, fullurl, data, timeout)
    529         for processor in self.process_response.get(protocol, []):
    530             meth = getattr(processor, meth_name)
--> 531             response = meth(req, response)
    532 
    533         return response

/usr/share/miniconda/lib/python3.8/urllib/request.py in http_response(self, request, response)
    638         # request was successfully received, understood, and accepted.
    639         if not (200 <= code < 300):
--> 640             response = self.parent.error(
    641                 'http', request, response, code, msg, hdrs)
    642 

/usr/share/miniconda/lib/python3.8/urllib/request.py in error(self, proto, *args)
    567         if http_err:
    568             args = (dict, 'default', 'http_error_default') + orig_args
--> 569             return self._call_chain(*args)
    570 
    571 # XXX probably also want an abstract factory that knows when it makes

/usr/share/miniconda/lib/python3.8/urllib/request.py in _call_chain(self, chain, kind, meth_name, *args)
    500         for handler in handlers:
    501             func = getattr(handler, meth_name)
--> 502             result = func(*args)
    503             if result is not None:
    504                 return result

/usr/share/miniconda/lib/python3.8/urllib/request.py in http_error_default(self, req, fp, code, msg, hdrs)
    647 class HTTPDefaultErrorHandler(BaseHandler):
    648     def http_error_default(self, req, fp, code, msg, hdrs):
--> 649         raise HTTPError(req.full_url, code, msg, hdrs, fp)
    650 
    651 class HTTPRedirectHandler(BaseHandler):

HTTPError: HTTP Error 404: Not Found
df.sample(3)
---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
<ipython-input-3-b81a29911479> in <module>
----> 1 df.sample(3)

NameError: name 'df' is not defined

让我们先获取所有航空公司(carrier)代码的集合,以便稍后在搜索框中输入正确的内容。

df['carrier'].unique()
---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
<ipython-input-4-8d832dfc27ce> in <module>
----> 1 df['carrier'].unique()

NameError: name 'df' is not defined

让我们来分配我们将在应用程序中使用的小部件。一般来说,所有这些小部件都将用于筛选数据集,从而实现可视化。

# Month selector. IntSlider holds integer traits, so the bounds are written as
# int literals rather than floats.
month = widgets.IntSlider(
    value=1,
    min=1,
    max=12,
    step=1,
    description='Month:',
    continuous_update=False,
)

# Toggle read by the update callback to decide whether the month filter applies.
use_date = widgets.Checkbox(
    description='Date: ',
    value=True,
)

container = widgets.HBox(children=[use_date, month])

# Airline picker; the options are the distinct carrier codes found in the data.
# NOTE(review): the initial value 'DL' must be one of those codes — confirm
# against the dataset.
textbox = widgets.Dropdown(
    description='Airline:   ',
    value='DL',
    options=df['carrier'].unique().tolist(),
)

# Origin-airport picker, built the same way as the airline picker.
# NOTE(review): the initial value 'LGA' must be present in df['origin'].
origin = widgets.Dropdown(
    options=df['origin'].unique().tolist(),
    value='LGA',
    description='Origin Airport:',
)


# Build the figure widget with two overlaid histograms. Both traces start out
# showing the full, unfiltered delay columns.
trace1 = go.Histogram(name='Arrival Delays', x=df['arr_delay'], opacity=0.75)
trace2 = go.Histogram(name='Departure Delays', x=df['dep_delay'], opacity=0.75)
g = go.FigureWidget(
    data=[trace1, trace2],
    layout=go.Layout(
        title={'text': 'NYC FlightDatabase'},
        barmode='overlay',
    ),
)
---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
<ipython-input-5-ffdd5064d3eb> in <module>
     18     description='Airline:   ',
     19     value='DL',
---> 20     options=df['carrier'].unique().tolist()
     21 )
     22 

NameError: name 'df' is not defined

现在让我们编写一个函数来处理来自小部件的输入,并改变 graph 的状态。

def validate():
    """Return True when both dropdown selections occur in the dataset.

    Checks that ``origin.value`` is one of the values in ``df['origin']`` and
    that ``textbox.value`` is one of the values in ``df['carrier']``.
    """
    return (origin.value in df['origin'].unique()
            and textbox.value in df['carrier'].unique())


def response(change):
    """Redraw both histograms from the rows matching the current widget values.

    Used as the observer callback for the filter widgets. ``change`` is the
    notification payload passed by the widget; it is not used because the
    current state is read directly from the widgets.

    Rows are always filtered by the selected airline and origin airport, and
    additionally by month when the ``use_date`` checkbox is ticked. If
    ``validate()`` fails, the figure is left untouched.
    """
    if not validate():
        return

    # Vectorized boolean mask. The airline filter reads the dropdown in both
    # modes (previously the no-date branch was hard-coded to carrier 'DL').
    mask = (df['carrier'] == textbox.value) & (df['origin'] == origin.value)
    if use_date.value:
        mask = mask & (df['month'] == month.value)
    temp_df = df[mask]

    # Apply all figure changes inside one batch_update() context.
    with g.batch_update():
        g.data[0].x = temp_df['arr_delay']
        g.data[1].x = temp_df['dep_delay']
        g.layout.barmode = 'overlay'
        g.layout.xaxis.title = 'Delay in Minutes'
        g.layout.yaxis.title = 'Number of Delays'


# Register `response` as the "value" observer on every filter widget.
for _control in (origin, textbox, month, use_date):
    _control.observe(response, names="value")
---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
<ipython-input-6-87aeb724f65f> in <module>
     28 
     29 
---> 30 origin.observe(response, names="value")
     31 textbox.observe(response, names="value")
     32 month.observe(response, names="value")

NameError: name 'origin' is not defined

是时候试试这个应用了!!

# Assemble the app: the date controls row, the airport/airline row, then the
# figure. The VBox is left as the final expression of the cell.
container2 = widgets.HBox(children=[origin, textbox])
widgets.VBox(children=[container, container2, g])
---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
<ipython-input-7-ca1441aa06fd> in <module>
----> 1 container2 = widgets.HBox([origin, textbox])
      2 widgets.VBox([container,
      3               container2,
      4               g])

NameError: name 'origin' is not defined
https://media.githubusercontent.com/media/xinet-collections/test-dastsets/main/tests/figurewidget-app.gif