Is there a better way to write this code to extract data from a REST API using the requests library? It currently takes about 10 minutes to finish, and I suspect the code below is not the most efficient approach.
import time
import requests
import datetime
import pandas as pd
def loan_rest_api(base_url=None, page_size=5000, timeout=60):
    """Extract the full ``loan`` table from the REST API into a DataFrame.

    Pages through the API ``page_size`` rows at a time. All page requests
    are sent through one ``requests.Session`` so the TCP/TLS connection is
    reused instead of being re-established per page — with ~40 pages of
    5000 rows this is usually the dominant cost of the original version.

    Parameters
    ----------
    base_url : str, optional
        Root URL of the REST API. Defaults to the module-level
        ``REST_API_LINK`` constant (assumed defined elsewhere in this
        file — TODO confirm).
    page_size : int, optional
        Rows requested per page (original hard-coded 5000).
    timeout : int or float, optional
        Per-request timeout in seconds, so a hung connection cannot stall
        the extract indefinitely.

    Returns
    -------
    pandas.DataFrame
        All pages concatenated (about 200,000 rows for the loan table);
        an empty DataFrame if no page was retrieved successfully.
    """
    search_start = time.time()
    print("Extracting loan data from Rest API...")
    # The 'loan' table has ~50 columns; requesting only the 17 we need
    # server-side shrinks every response payload.
    columns = (
        "loan_id,loan_name,loan_parent_id,loan_status,loan_type,"
        "loan_classification,loan_create_date,loan_last_update,"
        "loan_amount,loan_amount_last_update,loan_balance,"
        "loan_balance_last_update,loan_interest,loan_interest_last_update,"
        "city,office_location,project_name"
    )
    url = (base_url if base_url is not None else REST_API_LINK) + "loan"
    pages = []
    # Session keeps the underlying HTTP connection alive across requests
    # (connection pooling) — the original opened a fresh one per page.
    with requests.Session() as session:
        page_number = 1
        # The total page count is unknown up front, so loop until the API
        # signals the end: a non-200 status or an empty page.
        while True:
            pagination = {
                "pageSize": str(page_size),
                "pageNumber": str(page_number),
                "attributes": columns,
            }
            response = session.get(url, params=pagination, timeout=timeout)
            if response.status_code != 200:
                break
            results = response.json()
            # Some APIs answer 200 with an empty body past the last page;
            # without this check the loop could spin forever.
            if not results:
                break
            pages.append(pd.DataFrame(results))
            page_number += 1
    # pd.concat raises ValueError on an empty list — e.g. when the very
    # first request fails — so fall back to an empty frame.
    final_df = pd.concat(pages, ignore_index=True) if pages else pd.DataFrame()
    search_end = time.time()
    print(f" ...search completed in {search_end - search_start: .2f} seconds.")
    print('Search Done.')
    # Bug fix: the original ended with ``return print('Search Done.')``,
    # which returns None and silently discarded the extracted data.
    return final_df