
    U th؉                       U d dl mZ d dlZd dlZd dlmZmZmZmZm	Z	 d dl
mZ ddlmZ  G d de	          Z ed	d
          ZdGdZdHdZddgfdIdZdJdKdZddgfdIdZdGdZdGdZdGdZdGdZdGdZdGd ZdLd$ZddgfdMd'ZdGd(ZdNd*ZdOd.Z dPd0Z!dQd2Z"dRd4Z#dSd6Z$dTd8Z%dUd9Z&dVdWd=Z'd>Z(d?e)d@<   dXdBZ*dYdFZ+dS )Z    )annotationsN)AnyTypeVarCallableOptional
NamedTuple)	TypeAlias   )pandasc                  n    e Zd ZU ded<   dZded<   dZded<   dZded<   dZded	<   dZded
<   dZ	ded<   dS )RemediationstrnameNzOptional[str]immediate_msgnecessary_msgzOptional[Callable[[Any], Any]]necessary_fnoptional_msgoptional_fn	error_msg)
__name__
__module____qualname____annotations__r   r   r   r   r   r        j/var/www/html/mycamper/aliexpress-site/backend/venv/lib/python3.11/site-packages/openai/lib/_validators.pyr   r      s         III#'M''''#'M''''37L7777"&L&&&&26K6666#I######r   r   OptionalDataFrameTzOptional[pd.DataFrame])bounddfpd.DataFramereturnc                    d}t          |           |k    rdnd}dt          |            d| }t          d|          S )z
    This validator will only print out the number of examples and recommend to the user to increase the number of examples if less than 100.
    d    z. In general, we recommend having at least a few hundred examples. We've found that performance tends to linearly increase for every doubling of the number of examplesz
- Your file contains z prompt-completion pairsnum_examplesr   r   )lenr   )r   MIN_EXAMPLESoptional_suggestionr   s       r   num_examples_validatorr*      s`     L r77l"" 	 w 
 ec"ggddObddMN-HHHHr   necessary_columnr   c                    ddd}d}d}d}| j         vr/d | j         D             v rdfd	}|}d
 d}d d}nd d}t          d||||          S )z[
    This validator will ensure that the necessary column is present in the dataframe.
    r   r    columnr   r!   c                    fd| j         D             }|                     |d                                         id           | S )Nc                ^    g | ])}t          |                                          k    '|*S r   r   lower).0cr-   s     r   
<listcomp>zInecessary_column_validator.<locals>.lower_case_column.<locals>.<listcomp>-   s1    BBBaQ6)A)A)A)A)Ar   r   T)columnsinplace)r5   renamer1   )r   r-   colss    ` r   lower_case_columnz5necessary_column_validator.<locals>.lower_case_column,   sJ    BBBB2:BBB
		47FLLNN3T	BBB	r   Nc                P    g | ]#}t          |                                          $S r   r0   r2   r3   s     r   r4   z.necessary_column_validator.<locals>.<listcomp>7   s&    CCC1ACCCr   c                     |           S Nr   )r   r9   r+   s    r   lower_case_column_creatorz=necessary_column_validator.<locals>.lower_case_column_creator9   s    ((-=>>>r   z
- The `z ` column/key should be lowercasezLower case column name to ``z^` column/key is missing. Please make sure you name your columns/keys appropriately, then retryr+   )r   r   r   r   r   )r   r    r-   r   r!   r    )r   r    r!   r    )r5   r   )r   r+   r   r   r   r   r>   r9   s    `     @r   necessary_column_validatorr@   '   s    
   
 MLMIrz))CC
CCCCC? ? ? ? ? ? ? 5LZ(8ZZZMM:JMMMMM M,  M  M  MI##!   r   prompt
completionfields	list[str]c                   g }d}d}d}t          | j                  dk    rYfd| j        D             }d}|D ]/fd|D             }t          |          dk    r|d d d	z  }0d
| | }d| }dfd}t          d|||          S )zK
    This validator will remove additional columns from the dataframe.
    Nr
   c                    g | ]}|v|	S r   r   )r2   r3   rC   s     r   r4   z/additional_column_validator.<locals>.<listcomp>U   s    GGGAqar   r$   c                    g | ]}|v |	S r   r   )r2   r3   acs     r   r4   z/additional_column_validator.<locals>.<listcomp>X   s    ===!R1WWAWWWr   r   z9
  WARNING: Some of the additional columns/keys contain `z<` in their name. These will be ignored, and the column/key `z`` will be used instead. This could also result from a duplicate column/key in the provided file.zh
- The input file should contain exactly two columns/keys per row. Additional columns/keys present are: z Remove additional columns/keys: xr   r!   c                    |          S r=   r   rI   rC   s    r   r   z1additional_column_validator.<locals>.necessary_fn^   s    V9r   additional_columnr   r   r   r   rI   r   r!   r   )r'   r5   r   )	r   rC   additional_columnsr   r   r   warn_messagedupsrH   s	    `      @r   additional_column_validatorrR   K   s<    MML
2:GGGGGGG$ 	B 	BB====1===D4yy1}}  !B]_  !B  !B  ^`  !B  !B  !B  B g  EW  g  Ye  g  gO;MOO	 	 	 	 	 	  ##!	   r   fieldc                   d}d}d}|                               d                                           s,|                                                                          r||          dk    |                                          z  }|                                 j        |                                         }d d| }dfd	}d
t          |           d d}t          d |||          S )zA
    This validator will ensure that no completion is empty.
    Nc                    | dk    S )Nr$   r   rI   s    r   <lambda>z+non_empty_field_validator.<locals>.<lambda>q   s
    b r   r$   z
- `z?` column/key should not contain empty strings. These are rows: rI   r   r!   c                R    | |          dk                                  g          S )Nr$   subset)dropna)rI   rS   s    r   r   z/non_empty_field_validator.<locals>.necessary_fnv   s)    QuX^$++E7+;;;r   Remove z rows with empty sempty_rM   rN   )applyanyisnullreset_indexindextolistr'   r   )r   rS   r   r   r   
empty_rowsempty_indexess    `     r   non_empty_field_validatorrg   i   s(    MLM	%y(())--// P2e93C3C3E3E3I3I3K3K Pi2o"U)*:*:*<*<=
((.z:AACCuuufsuu	< 	< 	< 	< 	< 	< P#m"4"4OOuOOOe##!	   r   c                r   |                                }|                                 j        |                                         }d}d}d}t	          |          dk    rDdt	          |           dd                               d| }dt	          |           d	}dfd}t          d|||          S )zY
    This validator will suggest to the user to remove duplicate rows if they exist.
    rY   Nr   
- There are z duplicated -z sets. These are rows: r\   z duplicate rowsrI   r   r!   c                0    |                                S )NrY   )drop_duplicatesrK   s    r   r   z.duplicated_rows_validator.<locals>.optional_fn   s    $$F$333r   duplicated_rowsr   r   r   r   rN   )
duplicatedrb   rc   rd   r'   joinr   )r   rC   rm   duplicated_indexesr   r   r   s    `     r   duplicated_rows_validatorrr      s    mm6m22O))/@GGIIMLK
"" L-?)@)@  L  LchhW]N^N^  L  L  xJ  L  LI%7!8!8III	4 	4 	4 	4 	4 	4 #!	   r   c                   d}d}d}t          |           }|dk    rRdd |           t                    dk    r0d	t                     d
 d}dt                     d}dfd}t          d|||          S )zW
    This validator will suggest to the user to remove examples that are too long.
    Nopen-ended generationdr    r!   r   c                    |                      d d          }|                                 j        |                                         S )Nc                \    t          | j                  t          | j                  z   dk    S )Ni'  )r'   rA   rB   rV   s    r   rW   zClong_examples_validator.<locals>.get_long_indexes.<locals>.<lambda>   s#    c!(mmc!,>O>O.ORW.W r      )axis)r_   rb   rc   rd   )ru   long_exampless     r   get_long_indexesz1long_examples_validator.<locals>.get_long_indexes   s>    GG$W$W^_G``M==??(7>>@@@r   r   ri   z. examples that are very long. These are rows: zf
For conditional generation, and for classification the examples shouldn't be longer than 2048 tokens.r\   z long examplesrI   c                     |           }|k    r3t           j                            dt          |           d| d           |                     |          S )NzeThe indices of the long examples has changed as a result of a previously applied recommendation.
The z? long examples to be dropped are now at the following indices: 
)sysstdoutwriter'   drop)rI   long_indexes_to_dropr{   long_indexess     r   r   z,long_examples_validator.<locals>.optional_fn   s    '7'7':':$#777J$$ t  BE  FZ  B[  B[  t  t  \p  t  t  t   vv2333r   rz   rn   )ru   r    r!   r   rN   )infer_task_typer'   r   )r   r   r   r   ft_typer{   r   s        @@r   long_examples_validatorr      s     MLKb!!G)))	A 	A 	A 	A ('++|q   dS->->  d  dnz  d  d  dMFS%6%6FFFL4 4 4 4 4 4 4 #!	   r   c                   d}d}d}d}dg d}|D ]p}|dk    r2| j         j                            d                                          r:| j         j                            |d                                          rn|                     dd          }t          |           }|d	k    rt          d
          S d"dt          | j         d          }	| j         |	k                                    rd|	 d}t          d
|          S |	dk    r|	                    dd          }
d|
 d}t          |	          dk    r	|d| dz  }| j         j        dt          |	                    j                            |	d                                          r	|d|	 dz  }nd}|	dk    rd| d}d#fd}t          d ||||!          S )$z
    This validator will suggest to add a common suffix to the prompt if one doesn't already exist in case of classification or conditional generation.
    Nz


### =>

) ->z

###

z

===

z

---

z

===>

z

--->

r   r}   Fregex\nrt   common_suffixr   rI   r   suffixr!   c                &    | dxx         |z  cc<   | S NrA   r   rI   r   s     r   
add_suffixz2common_prompt_suffix_validator.<locals>.add_suffix   s    	(vr   xfixzAll prompts are identical: `zt`
Consider leaving the prompts blank if you want to do open-ended generation, otherwise ensure prompts are differentr   r   r$   z 
- All prompts end with suffix `r?   
   R. This suffix seems very long. Consider replacing with a shorter suffix, such as `z5
  WARNING: Some of your prompts contain the suffix `zZ` more than once. We strongly suggest that you review your prompts and add a unique suffixa  
- Your data does not contain a common separator at the end of your prompts. Having a separator string appended to the end of the prompt makes it clearer to the fine-tuned model where the completion should begin. See https://platform.openai.com/docs/guides/fine-tuning/preparing-your-dataset for more detail and examples. If you intend to do open-ended generation, then you should leave the prompts emptyzAdd a suffix separator `z` to all promptsc                     |           S r=   r   rI   r   suggested_suffixs    r   r   z3common_prompt_suffix_validator.<locals>.optional_fn       :a!1222r   common_completion_suffixr   r   r   r   r   rI   r   r   r   r!   r   rN   )
rA   r   containsr`   replacer   r   get_common_xfixallr'   )r   r   r   r   r   suffix_optionssuffix_optiondisplay_suggested_suffixr   r   common_suffix_new_line_handledr   r   s              @@r   common_prompt_suffix_validatorr      s    IMLK (  N (  E!!y}%%d++//11 9=!!-u!==AACC 	(/77eDDb!!G)))0000    $BIH===M
	]"'')) F x=  x  x  x	9EEEE)6)>)>tU)K)K&]<Z]]]}""  N  sK  N  N  N  NM9=.C..../3<<]RW<XX\\^^ 	@  @Vc  @  @  @  @M p\2J\\\	3 	3 	3 	3 	3 	3 	3 '#!   r   c                X   d}d}d}d}t          | j        d          dk    rt          d          S dd| j        k                                    rt          d          S dk    r+d d}|t	                    k     r|dz  }d d}dfd}t          d|||          S )zd
    This validator will suggest to remove a common prefix from the prompt if a long one exist.
       Nprefixr   r$   common_prefixr   rI   r   r!   c                P    | d         j         t          |          d          | d<   | S r   r   r'   )rI   r   s     r   remove_common_prefixz<common_prompt_prefix_validator.<locals>.remove_common_prefix  s%    koc&kkmm4(r   z"
- All prompts start with prefix `r?   z. Fine-tuning doesn't require the instruction specifying the task, or a few-shot example scenario. Most of the time you should only add the input data into the prompt, and the desired output into the completionRemove prefix `z` from all promptsc                     |           S r=   r   )rI   r   r   s    r   r   z3common_prompt_prefix_validator.<locals>.optional_fn!  s    ++A}===r   common_prompt_prefixrn   )rI   r   r   r   r!   r   rN   )r   rA   r   r   r'   )r   MAX_PREFIX_LENr   r   r   r   r   s        @@r   common_prompt_prefix_validatorr     s#    NMLK#BIH===M0000    		]"'')) 10000NmNNNC....  r  rMN]NNNL> > > > > > > ##!	   r   c                l   d}t          | j        d          t                    dk    od         dk    t                    |k     rt          d          S dd| j        k                                    rt          d          S d d}d d}dfd}t          d|||          S )zh
    This validator will suggest to remove a common prefix from the completion if a long one exist.
       r   r   r    r   r   rI   r   	ws_prefixr!   c                p    | d         j         t          |          d          | d<   |rd| d          | d<   | S )NrB   r   r   )rI   r   r   s      r   r   z@common_completion_prefix_validator.<locals>.remove_common_prefix7  sC    L/-c&kkmm<, 	43!L/33AlOr   z&
- All completions start with prefix `z_`. Most of the time you should only add the output data into the completion, without any prefixr   z` from all completionsc                     |           S r=   r   )rI   r   r   r   s    r   r   z7common_completion_prefix_validator.<locals>.optional_fnE  s    ##A}i@@@r   common_completion_prefixrn   )rI   r   r   r   r   r   r!   r   rN   )r   rB   r'   r   r   )r   r   r   r   r   r   r   r   s        @@@r   "common_completion_prefix_validatorr   ,  s(    N#BMAAAMM""Q&B=+;s+BI
=N**0000    	&++-- 10000 mm  m  m  mMJ]JJJLA A A A A A A A '#!	   r   c                :   d}d}d}d}t          |           }|dk    s|dk    rt          d          S t          | j        d          }| j        |k                                    rd| d	| d
}t          d|          S dg d}|D ]8}| j        j                            |d                                          r6|                     dd          }	d"d|dk    r|                    dd          }
d|
 d
}t          |          dk    r	|d|	 d
z  }| j        j        dt          |                    j                            |d                                          r	|d| dz  }nd}|dk    rd|	 d}d#fd}t          d ||||!          S )$z
    This validator will suggest to add a common suffix to the completion if one doesn't already exist in case of classification or conditional generation.
    Nrt   classificationr   r   r   r   z All completions are identical: `zJ`
Ensure completions are different, otherwise the model will just repeat `r?   r   z [END])	r}   .z ENDz***z+++z&&&z$$$z@@@z%%%Fr   r}   r   rI   r   r!   c                &    | dxx         |z  cc<   | S NrB   r   r   s     r   r   z6common_completion_suffix_validator.<locals>.add_suffixv  s    	,6!r   r$   z$
- All completions end with suffix `r   r   z9
  WARNING: Some of your completions contain the suffix `zU` more than once. We suggest that you review your completions and add a unique endingaH  
- Your data does not contain a common ending at the end of your completions. Having a common ending string appended to the end of the completion makes it clearer to the fine-tuned model where the completion should end. See https://platform.openai.com/docs/guides/fine-tuning/preparing-your-dataset for more detail and examples.zAdd a suffix ending `z` to all completionsc                     |           S r=   r   r   s    r   r   z7common_completion_suffix_validator.<locals>.optional_fn  r   r   r   r   r   rN   )
r   r   r   rB   r   r   r   r`   r   r'   )r   r   r   r   r   r   r   r   r   r   r   r   r   s              @@r   "common_completion_suffix_validatorr   P  s    IMLKb!!G)))W8H-H-H0000#BMAAAM
&++-- F b}  b  b  R_  b  b  b	9EEEE  
 
 
N (  =%%m5%AAEEGG 	(/77eDD    )6)>)>tU)K)K&a@^aaa}""  N  sK  N  N  N  NM=2M 2 22237@@V[@\\``bb 	  Zg        M d]/G]]]	3 	3 	3 	3 	3 	3 	3 '#!   r   c                    dd}d}d}d}| j         j        dd                                         dk    s| j         j        d         d         dk    rd	}d
}|}t	          d|||          S )z
    This validator will suggest to add a space at the start of the completion if it doesn't already exist. This helps with tokenization.
    rI   r   r!   c                D    | d                              d           | d<   | S )NrB   c                :    |                      d          rdnd| z   S )Nr   r$   )
startswith)r]   s    r   rW   zLcompletions_space_start_validator.<locals>.add_space_start.<locals>.<lambda>  s"    cARAR;[22X[_`:` r   )r_   rV   s    r   add_space_startz:completions_space_start_validator.<locals>.add_space_start  s&    L///0`0`aa,r   Nrx   r   r   z
- The completion should start with a whitespace character (` `). This tends to produce better results due to the tokenization we use. See https://platform.openai.com/docs/guides/fine-tuning/preparing-your-dataset for more detailsz=Add a whitespace character to the beginning of the completioncompletion_space_startrn   rN   )rB   r   nuniquevaluesr   )r   r   r   r   r   s        r   !completions_space_start_validatorr     s    
    LKM	}!$$&&!++r}/CA/Fq/IS/P/P BV%%#!	   r   r-   r   Remediation | Nonec                   dfd}|                               d                                           }|                               d                                           }|dz  |k    rt          dd	 d
 dd d|          S dS )zt
    This validator will suggest to lowercase the column values, if more than a third of letters are uppercase.
    rI   r   r!   c                L    |          j                                         | <   | S r=   r0   )rI   r-   s    r   
lower_casez(lower_case_validator.<locals>.lower_case  s#    fIM''))&	r   c                4    t          d | D                       S )Nc              3  j   K   | ].}|                                 |                                *d V  /dS rx   N)isalphaisupperr;   s     r   	<genexpr>z9lower_case_validator.<locals>.<lambda>.<locals>.<genexpr>  A      0]0]q0]QRQZQZQ\Q\0]0]0]0]0]0]0]r   sumrV   s    r   rW   z&lower_case_validator.<locals>.<lambda>      S0]0]A0]0]0]-]-] r   c                4    t          d | D                       S )Nc              3  j   K   | ].}|                                 |                                *d V  /dS r   )r   islowerr;   s     r   r   z9lower_case_validator.<locals>.<lambda>.<locals>.<genexpr>  r   r   r   rV   s    r   rW   z&lower_case_validator.<locals>.<lambda>  r   r   r
   r   z
- More than a third of your `z%` column/key is uppercase. Uppercase zs tends to perform worse than a mixture of case encountered in normal language. We recommend to lower case the data if that makes sense in your domain. See https://platform.openai.com/docs/guides/fine-tuning/preparing-your-dataset for more detailsz'Lowercase all your data in column/key `r?   rn   NrN   )r_   r   r   )r   r-   r   count_uppercount_lowers    `   r   lower_case_validatorr     s    
      V*""#]#]^^bbddKV*""#]#]^^bbddKQ$$ iF  i  iio  i  i  iL6LLL"	
 
 
 	
 4r   fname'tuple[pd.DataFrame | None, Remediation]c                   d}d}d}d}d}t           j                            |           r	 |                                                     d          s'|                                                     d          rk|                                                     d          rdnd\  }}d| d}d| d	}t          j        | |t          
                              d          }n|                                                     d          rgd}d}t          j	        |           }	|	j
        }
t          |
          dk    r|dz  }t          j        | t                                        d          }ne|                                                     d          rd}d}t          | d          5 }|                                }t          j        d |                    d          D             |t                                        d          }ddd           n# 1 swxY w Y   n|                                                     d          rxt          j        | dt                                        d          }t          |          dk    r4d}d}t          j        | t                                        d          }nn|                                                     d          r	 t          j        | dt                                        d          }t          |          dk    r/t          j        | t                                        d          }nd}d}ns# t$          $ r1 t          j        | t                                        d          }Y n9w xY wd }d!| v r&|d"|  d#|                     d!          d$          d%z  }n	|d"|  d&z  }nV# t$          t&          f$ r< |                     d!          d$                                         }d'|  d(| d)| d*}Y n
w xY wd+|  d,}t+          d-|||.          }||fS )/z
    This function will read a file saved in .csv, .json, .txt, .xlsx or .tsv format using pandas.
     - for .xlsx it will read the first sheet
     - for .txt it will assume completions and split on newline
    Nz.csvz.tsv)CSV,)TSV	z=
- Based on your file extension, your file is formatted as a z filezYour format `z` will be converted to `JSONL`)sepdtyper$   z.xlsxzH
- Based on your file extension, your file is formatted as an Excel filez/Your format `XLSX` will be converted to `JSONL`rx   z
- Your Excel file contains more than one sheet. Please either save as csv or ensure all data is present in the first sheet. WARNING: Reading only the first sheet...)r   z.txtz9
- Based on your file extension, you provided a text filez.Your format `TXT` will be converted to `JSONL`rc                    g | ]}d |gS )r$   r   )r2   lines     r   r4   z#read_any_format.<locals>.<listcomp>  s    DDD"dDDDr   r}   )r5   r   .jsonlT)linesr   z^
- Your JSONL file appears to be in a JSON format. Your file will be converted to JSONL formatz/Your format `JSON` will be converted to `JSONL`z.jsonz^
- Your JSON file appears to be in a JSONL format. Your file will be converted to JSONL formatz]Your file must have one of the following extensions: .CSV, .TSV, .XLSX, .TXT, .JSON or .JSONLr   z Your file `z` ends with the extension `.z` which is not supported.z` is missing a file extension.zYour file `z!` does not appear to be in valid z9 format. Please ensure your file is formatted as a valid z file.zFile z does not exist.read_any_format)r   r   r   r   )ospathisfiler1   endswithpdread_csvr   fillna	ExcelFilesheet_namesr'   
read_excelopenread	DataFramesplit	read_json
ValueError	TypeErrorupperr   )r   rC   remediationr   r   r   r   file_extension_str	separatorxlssheetsfcontents                r   r   r     s    KMMI	B	w~~e @4<	v{{}}%%f-- 7V1G1G1O1O 7V@E@V@VW]@^@^0qdq-"InUgnnn  !c0B b b b[ISAAAHHLL''00 0V k Ql5))v;;??!  &N  NM]5444;;B??''// (V \ P%%% !ffhhGDDd0C0CDDD &!   fRjj	 ! ! ! ! ! ! ! ! ! ! ! ! ! ! ! ''11 V\%t3???FFrJJr77a<< %FM$UMe3777>>rBBBB''00 VCe4sCCCJJ2NNB2ww!||\%s;;;BB2FF )J(Y! C C Ce3777>>rBBBBBC
 t  %<<  "C  "C  "CSXS^S^_bScScdfSg  "C  "C  "C  CII!U!U!U!UUII& 	v 	v 	v!&S!1!1"!5!;!;!=!= ve  v  vN`  v  v  \n  v  v  vIII	v
 4E333	##	  K {?s^   FO .A!HO HO "H#CO /A5M% $O %8N O N  8O A
P&%P&c                ^    t          |           }d}|dk    rd| d}t          d|          S )z
    This validator will infer the likely fine-tuning format of the data, and display it to the user if it is classification.
    It will also suggest to use ada and explain train/validation split benefits.
    Nr   zK
- Based on your data it seems like you're trying to fine-tune a model for z
- For classification, we recommend you try one of the faster and cheaper models, such as `ada`
- For classification, you can estimate the expected model performance by keeping a held out dataset, which is not used for trainingr%   r&   )r   r   )r   r   r   s      r   format_inferrer_validatorr
    sR    
 b!!GM""" Vgn  V  V  VN-HHHHr   r  c                *   |j         Dt          j                            d|j         d|j          d           t          j        d           |j        $t          j                            |j                   |j        |                    |           } | S )zs
    This function will apply a necessary remediation to a dataframe, or print an error message if one exists.
    Nz

ERROR in z validator: z

Aborting...rx   )	r   r~   stderrr   r   exitr   r   r   )r   r  s     r   apply_necessary_remediationr  (  s     (
m)9mm{G\mmmnnn ,
2333+%%b))Ir   
input_textauto_acceptboolc                    t           j                            |            |r!t           j                            d           dS t                                                      dk    S )NzY
Tn)r~   r   r   inputr1   )r  r  s     r   accept_suggestionr  6  sR    JZ    
t77==??c!!r   tuple[pd.DataFrame, bool]c                    d}d|j          d}|j         0t          ||          r |j        J |                    |           } d}|j        (t          j                            d|j         d           | |fS )zc
    This function will apply an optional remediation to a dataframe, based on the user input.
    Fz- [Recommended] z [Y/n]: NTz- [Necessary] r}   )r   r  r   r   r~   r   r   )r   r  r  optional_appliedr  s        r   apply_optional_remediationr  >  s     FK$<FFFJ+Z55 	$*666((,,B# ,
G+*CGGGHHHr   Nonec                "   t          |           }d}|dk    rt          |           }|dz  }n-|                     d                                          }|dz  }dd} ||dz             }t          j                            d| d           dS )z?
    Estimate the time it'll take to fine-tune the dataset
    g      ?r   g
ףp=
?T)rc   g|?5^?timefloatr!   r   c                    | dk     rt          | d           dS | dk     rt          | dz  d           dS | dk     rt          | dz  d           dS t          | dz  d           dS )	N<   r
   z secondsi  z minutesiQ z hoursz days)round)r  s    r   format_timez.estimate_fine_tuning_time.<locals>.format_time]  s    "99D!nn....D[[D2Iq))3333E\\D4K++3333D5L!,,3333r      z:Once your model starts training, it'll approximately take z~ to train a `curie` model, and less for `ada` and `babbage`. Queue will approximately take half an hour per job ahead of you.
N)r  r  r!   r   )r   r'   memory_usager   r~   r   r   )r   	ft_formatexpected_timer%   sizer!  time_strings          r   estimate_fine_tuning_timer(  P  s      ##IM$$$2ww$t+T**..00v4 4 4 4 +mc122KJ 	R[  	R  	R  	R    r   r   c                     |rddgndg}d}	 |dk    rd| dnd fd|D             }t          d	 |D                       s|S |d
z  }>)N_train_validr$   r   Tz ()c                j    g | ]/}t           j                                      d           d|  d0S )r   	_preparedr   )r   r   splitext)r2   r   r   index_suffixs     r   r4   z!get_outfnames.<locals>.<listcomp>r  sE    xxxekrw//66q9``F`L```xxxr   c              3  T   K   | ]#}t           j                            |          V  $d S r=   )r   r   r   )r2   r  s     r   r   z get_outfnames.<locals>.<genexpr>s  s0      ??27>>!$$??????r   rx   )r`   )r   r   suffixesicandidate_fnamesr0  s   `    @r   get_outfnamesr5  m  s    ',6(##2$H	A$%EEyAyyyyrxxxxxowxxx??.>????? 	$##	Qr   tuple[int, object]c                    | j                                         }d }|dk    r$| j                                         j        d         }||fS )Nr
   r   )rB   r   value_countsrc   )r   	n_classes	pos_classs      r   get_classification_hyperparamsr;  x  sI    %%''IIA~~M..006q9	ir   any_remediationsc                    t          |           }t          | j        d          }t          | j        d          }d}d}|dk    rt	          ||          rd}d}	|                    dd	          }
|                    dd	          }t          |          d
k    rd| dnd}d}|s?|s=t          j        	                    d| d|	 d|
 d| d	           t          |            dS t	          ||          rt          ||          }|rt          |          dk    rd|d
         v r
d|d         v sJ d}t          t          |           |z
  t          t          |           dz                      }|                     |d          }|                     |j                  }|ddg                             |d
         dddd           |ddg                             |d         dddd           t%          |           \  }}|	dz  }	|dk    r
|	d | dz  }	nF|	d!| z  }	n=t          |          dk    sJ | ddg                             |d
         dddd           |rd"ndd#z   d$                    |          z   }|rd%|d          dnd}t          |
          d
k    rdnd&|
 d}t          j        	                    d'| d(|d
          d| |	 d)| | d           t          |            dS t          j        	                    d*           dS )+aQ  
    This function will write out a dataframe to a file, if the user would like to proceed, and also offer a fine-tuning command with the newly created file.
    For classification it will optionally ask the user if they would like to split the data into train/valid files, and modify the suggested command to include the valid set.
    r   r   FzQ- [Recommended] Would you like to split into training and validation set? [Y/n]: r   Tr$   r}   r   r   z Make sure to include `stop=["z;"]` so that the generated texts ends at the expected place.z@

Your data will be written to a new JSONL file. Proceed [Y/n]: zK
You can use your file for fine-tuning:
> openai api fine_tunes.create -t ""ue   

After you’ve fine-tuned a model, remember that your prompt has to end with the indicator string `zX` for the model to start generating completions, rather than continuing with the prompt.r
   trainvalidrx   i  g?*   )r  random_staterA   rB   recordsN)r   orientforce_asciiindentz! --compute_classification_metricsz" --classification_positive_class "z --classification_n_classes r]   z to `z` and `z -v "uc   After you’ve fine-tuned a model, remember that your prompt has to end with the indicator string `z
Wrote modified filezd`
Feel free to take a look!

Now use that file when fine-tuning:
> openai api fine_tunes.create -t "z

z#Aborting... did not write the file
)r   r   rA   rB   r  r   r'   r~   r   r   r(  r5  maxintsampler   rc   to_jsonr;  rp   )r   r   r<  r  r$  common_prompt_suffixr   r   r  additional_params%common_prompt_suffix_new_line_handled)common_completion_suffix_new_line_handledoptional_ending_stringfnamesMAX_VALID_EXAMPLESn_traindf_traindf_validr9  r:  files_stringvalid_stringseparator_reminders                          r   write_out_filerX    s   
  ##I*298DDD.r}8LLLEdJ$$$Z55 	E,@,H,Hu,U,U)0H0P0PQUW\0]0]- 899A== 	P)R  	P  	P  	P  	P  VJ .AE .A
 x\a  x  xdu  x  x  _D  x  x  ^t  x  x  x	
 	
 	
 	""%%%%%	:{	3	3 (Aue,, 	v;;!##6!9(<(<FSTIAUAUAUU!%#b''$66CGGcM8J8JKKGyy7y<<Hwwx~..Hh-.66q	iUSW 7    h-.66q	iUSW 7    $B"#E#E Iy!DDA~~!%V)%V%V%VV!!!%OI%O%OO!!v;;!####,'(00q	iUSW 1   
  %,"79>>&;Q;QR/4<+vay++++" 899Q>> B w  x]  w  w  w 	
 	
 zL  z  z  [a  bc  [d  z  z  gs  z  uF  z  z  L^  z  `v  z  z  z	
 	
 	
 	""%%%%%
?@@@@@r   c                    d}t          | j        j                                                  dk    rdS t          | j                                                  t          |           |z  k     rdS dS )z>
    Infer the likely fine-tuning task type from the data
       r   rt   r   zconditional generation)r   rA   r   r'   rB   unique)r   CLASSIFICATION_THRESHOLDs     r   r   r     sl      !
29=1$$&&
2=!!""SWW/G%GGG##r   r   seriesr   c                   d}	 |dk    r | j         t          |          dz    d         n| j         dt          |          dz            }|                                dk    rn ||j        d         k    rn|j        d         }~|S )zQ
    Finds the longest common suffix or prefix of all the values in a series
    r$   Tr   rx   Nr   )r   r'   r   r   )r]  r   common_xfixcommon_xfixess       r   r   r     s     K	259X5E5EFJ[))A-.00116:VlX[\gXhXhklXlVlKm 	   ""a''M0333'.q1K	2 r   z,Callable[[pd.DataFrame], Remediation | None]r	   	Validatorlist[Validator]c                     t           d d t          t          t          t          t
          d d t          t          t          t          t          gS )Nc                "    t          | d          S r   r@   rV   s    r   rW   z get_validators.<locals>.<lambda>  s    ,Q99 r   c                "    t          | d          S r   re  rV   s    r   rW   z get_validators.<locals>.<lambda>  s    ,Q== r   c                "    t          | d          S r   r   rV   s    r   rW   z get_validators.<locals>.<lambda>  s    &q(33 r   c                "    t          | d          S r   rh  rV   s    r   rW   z get_validators.<locals>.<lambda>  s    &q,77 r   )r*   rR   rg   r
  rr   r   r   r   r   r   r   r   r   r   get_validatorsrj    sE    99==#!!!3377&&**) r   
validatorswrite_out_file_funcCallable[..., Any]c                   g }||                     |           |D ]4} ||           }|%|                     |           t          | |          } 5t          d |D                       }t          d |D                       }	d}
|r=t          j                            d           |D ]}t          | ||          \  } }|
p|}
nt          j                            d           |
p|	} || |||           d S )Nc                .    g | ]}|j         |j        |S r=   )r   r   r2   r  s     r   r4   z$apply_validators.<locals>.<listcomp>  s3     	
 	
 	
'3{7P7\ 7\7\7\r   c                     g | ]}|j         	|S r=   )r   rp  s     r   r4   z$apply_validators.<locals>.<listcomp>  s     gggAZAfAfAfAfr   Fz?

Based on the analysis we will perform the following actions:
z

No remediations found.
)appendr  r`   r~   r   r   r  )r   r   r  rk  r  rl  optional_remediations	validator&any_optional_or_necessary_remediationsany_necessary_appliedany_optional_appliedr  !any_optional_or_necessary_applieds                r   apply_validatorsry    sa    02$$[111 > >	imm"!((555,R==B-0	
 	
4	
 	
 	
. .*  gg(=ggg  !- 9
]^^^0 	L 	LK#=b+{#[#[ B #7#K;K  	L 	
7888(<(U@U%E#DkRRRRRr   )r   r    r!   r   )r   r    r+   r   r!   r   )r   r    rC   rD   r!   r   )rB   )r   r    rS   r   r!   r   )r   r    r-   r   r!   r   )r   r   rC   rD   r!   r   )r   r   r  r   r!   r   )r  r   r  r  r!   r  )r   r    r  r   r  r  r!   r  )r   r    r!   r  )r   r   r   r  r!   rD   )r   r    r!   r6  )
r   r    r   r   r<  r  r  r  r!   r  )r   r    r!   r   )r   )r]  r   r   r   r!   r   )r!   rb  )r   r    r   r   r  r   rk  rb  r  r  rl  rm  r!   r  ),
__future__r   r   r~   typingr   r   r   r   r   typing_extensionsr	   _extrasr   r   r   r   r*   r@   rR   rg   rr   r   r   r   r   r   r   r   r   r
  r  r  r  r(  r5  r;  rX  r   r   ra  r   rj  ry  r   r   r   <module>r~     s6   " " " " " " " 				 



 ? ? ? ? ? ? ? ? ? ? ? ? ? ? ' ' ' ' ' ' " " " " " "$ $ $ $ $* $ $ $ W19QRRR I I I I! ! ! !H HPQ]F^     <    4 FN|D\     2" " " "JA A A AH$ $ $ $N! ! ! !HA A A AH   2   . &.|$<V V V V Vr	I 	I 	I 	I   " " " "       $   :          HA HA HA HAV$ $ $ $    $ F	 E E E E   ('S 'S 'S 'S 'S 'Sr   